/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied.  See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */
package org.apache.parquet.hadoop;

import java.io.Closeable;
import java.io.IOException;
import java.util.HashMap;
import java.util.Map;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.parquet.bytes.ByteBufferAllocator;
import org.apache.parquet.column.ParquetProperties;
import org.apache.parquet.column.ParquetProperties.WriterVersion;
import org.apache.parquet.compression.CompressionCodecFactory;
import org.apache.parquet.conf.HadoopParquetConfiguration;
import org.apache.parquet.conf.ParquetConfiguration;
import org.apache.parquet.crypto.FileEncryptionProperties;
import org.apache.parquet.hadoop.api.WriteSupport;
import org.apache.parquet.hadoop.metadata.CompressionCodecName;
import org.apache.parquet.hadoop.metadata.ParquetMetadata;
import org.apache.parquet.hadoop.util.ConfigurationUtil;
import org.apache.parquet.hadoop.util.HadoopOutputFile;
import org.apache.parquet.io.OutputFile;
import org.apache.parquet.schema.MessageType;

/**
 * Write records to a Parquet file.
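 *
 * <p>A minimal usage sketch, assuming the {@code Group}-based {@code ExampleParquetWriter}
 * shipped in parquet-hadoop's example module:
 *
 * <pre>{@code
 * MessageType schema = MessageTypeParser.parseMessageType(
 *     "message point { required int32 x; required int32 y; }");
 *
 * try (ParquetWriter<Group> writer = ExampleParquetWriter.builder(new Path("points.parquet"))
 *     .withType(schema)
 *     .withConf(new Configuration())
 *     .withCompressionCodec(CompressionCodecName.SNAPPY)
 *     .build()) {
 *   Group point = new SimpleGroupFactory(schema).newGroup();
 *   point.add("x", 1);
 *   point.add("y", 2);
 *   writer.write(point);
 * }
 * }</pre>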
 */
public class ParquetWriter<T> implements Closeable {

  public static final int DEFAULT_BLOCK_SIZE = 128 * 1024 * 1024;
  public static final int DEFAULT_PAGE_SIZE = ParquetProperties.DEFAULT_PAGE_SIZE;
  public static final CompressionCodecName DEFAULT_COMPRESSION_CODEC_NAME = CompressionCodecName.UNCOMPRESSED;
  public static final boolean DEFAULT_IS_DICTIONARY_ENABLED = ParquetProperties.DEFAULT_IS_DICTIONARY_ENABLED;
  public static final boolean DEFAULT_IS_VALIDATING_ENABLED = false;
  public static final WriterVersion DEFAULT_WRITER_VERSION = ParquetProperties.DEFAULT_WRITER_VERSION;

  public static final String OBJECT_MODEL_NAME_PROP = "writer.model.name";

  // max size (bytes) to write as padding and the min size of a row group
  public static final int MAX_PADDING_SIZE_DEFAULT = 8 * 1024 * 1024; // 8MB

  private final InternalParquetRecordWriter<T> writer;
  private final CompressionCodecFactory codecFactory;

  /**
   * Create a new ParquetWriter.
   * (with dictionary encoding enabled and validation off)
   *
   * @param file                 the file to create
   * @param writeSupport         the implementation to write a record to a RecordConsumer
   * @param compressionCodecName the compression codec to use
   * @param blockSize            the block size threshold
   * @param pageSize             the page size threshold
   * @throws IOException if there is an error while writing
   * @deprecated will be removed in 2.0.0
   */
  @Deprecated
  public ParquetWriter(
      Path file,
      WriteSupport<T> writeSupport,
      CompressionCodecName compressionCodecName,
      int blockSize,
      int pageSize)
      throws IOException {
    this(
        file,
        writeSupport,
        compressionCodecName,
        blockSize,
        pageSize,
        DEFAULT_IS_DICTIONARY_ENABLED,
        DEFAULT_IS_VALIDATING_ENABLED);
  }

  /**
   * Create a new ParquetWriter.
   *
   * @param file                 the file to create
   * @param writeSupport         the implementation to write a record to a RecordConsumer
   * @param compressionCodecName the compression codec to use
   * @param blockSize            the block size threshold
   * @param pageSize             the page size threshold (both data and dictionary)
   * @param enableDictionary     to turn dictionary encoding on
   * @param validating           to turn on validation using the schema
   * @throws IOException if there is an error while writing
   * @deprecated will be removed in 2.0.0
   */
  @Deprecated
  public ParquetWriter(
      Path file,
      WriteSupport<T> writeSupport,
      CompressionCodecName compressionCodecName,
      int blockSize,
      int pageSize,
      boolean enableDictionary,
      boolean validating)
      throws IOException {
    this(file, writeSupport, compressionCodecName, blockSize, pageSize, pageSize, enableDictionary, validating);
  }

  /**
   * Create a new ParquetWriter.
   *
   * @param file                 the file to create
   * @param writeSupport         the implementation to write a record to a RecordConsumer
   * @param compressionCodecName the compression codec to use
   * @param blockSize            the block size threshold
   * @param pageSize             the page size threshold
   * @param dictionaryPageSize   the page size threshold for the dictionary pages
   * @param enableDictionary     to turn dictionary encoding on
   * @param validating           to turn on validation using the schema
   * @throws IOException if there is an error while writing
   * @deprecated will be removed in 2.0.0
   */
  @Deprecated
  public ParquetWriter(
      Path file,
      WriteSupport<T> writeSupport,
      CompressionCodecName compressionCodecName,
      int blockSize,
      int pageSize,
      int dictionaryPageSize,
      boolean enableDictionary,
      boolean validating)
      throws IOException {
    this(
        file,
        writeSupport,
        compressionCodecName,
        blockSize,
        pageSize,
        dictionaryPageSize,
        enableDictionary,
        validating,
        DEFAULT_WRITER_VERSION);
  }

  /**
   * Create a new ParquetWriter.
   *
   * <p>Directly instantiates a Hadoop {@link org.apache.hadoop.conf.Configuration} which reads
   * configuration from the classpath.
   *
   * @param file                 the file to create
   * @param writeSupport         the implementation to write a record to a RecordConsumer
   * @param compressionCodecName the compression codec to use
   * @param blockSize            the block size threshold
   * @param pageSize             the page size threshold
   * @param dictionaryPageSize   the page size threshold for the dictionary pages
   * @param enableDictionary     to turn dictionary encoding on
   * @param validating           to turn on validation using the schema
   * @param writerVersion        version of parquetWriter from {@link ParquetProperties.WriterVersion}
   * @throws IOException if there is an error while writing
   * @deprecated will be removed in 2.0.0
   */
  @Deprecated
  public ParquetWriter(
      Path file,
      WriteSupport<T> writeSupport,
      CompressionCodecName compressionCodecName,
      int blockSize,
      int pageSize,
      int dictionaryPageSize,
      boolean enableDictionary,
      boolean validating,
      WriterVersion writerVersion)
      throws IOException {
    this(
        file,
        writeSupport,
        compressionCodecName,
        blockSize,
        pageSize,
        dictionaryPageSize,
        enableDictionary,
        validating,
        writerVersion,
        new Configuration());
  }

  /**
   * Create a new ParquetWriter.
   *
   * @param file                 the file to create
   * @param writeSupport         the implementation to write a record to a RecordConsumer
   * @param compressionCodecName the compression codec to use
   * @param blockSize            the block size threshold
   * @param pageSize             the page size threshold
   * @param dictionaryPageSize   the page size threshold for the dictionary pages
   * @param enableDictionary     to turn dictionary encoding on
   * @param validating           to turn on validation using the schema
   * @param writerVersion        version of parquetWriter from {@link ParquetProperties.WriterVersion}
   * @param conf                 Hadoop configuration to use while accessing the filesystem
   * @throws IOException if there is an error while writing
   * @deprecated will be removed in 2.0.0
   */
  @Deprecated
  public ParquetWriter(
      Path file,
      WriteSupport<T> writeSupport,
      CompressionCodecName compressionCodecName,
      int blockSize,
      int pageSize,
      int dictionaryPageSize,
      boolean enableDictionary,
      boolean validating,
      WriterVersion writerVersion,
      Configuration conf)
      throws IOException {
    this(
        file,
        ParquetFileWriter.Mode.CREATE,
        writeSupport,
        compressionCodecName,
        blockSize,
        pageSize,
        dictionaryPageSize,
        enableDictionary,
        validating,
        writerVersion,
        conf);
  }

  /**
   * Create a new ParquetWriter.
   *
   * @param file                 the file to create
   * @param mode                 file creation mode
   * @param writeSupport         the implementation to write a record to a RecordConsumer
   * @param compressionCodecName the compression codec to use
   * @param blockSize            the block size threshold
   * @param pageSize             the page size threshold
   * @param dictionaryPageSize   the page size threshold for the dictionary pages
   * @param enableDictionary     to turn dictionary encoding on
   * @param validating           to turn on validation using the schema
   * @param writerVersion        version of parquetWriter from {@link ParquetProperties.WriterVersion}
   * @param conf                 Hadoop configuration to use while accessing the filesystem
   * @throws IOException if there is an error while writing
   * @deprecated will be removed in 2.0.0
   */
  @Deprecated
  public ParquetWriter(
      Path file,
      ParquetFileWriter.Mode mode,
      WriteSupport<T> writeSupport,
      CompressionCodecName compressionCodecName,
      int blockSize,
      int pageSize,
      int dictionaryPageSize,
      boolean enableDictionary,
      boolean validating,
      WriterVersion writerVersion,
      Configuration conf)
      throws IOException {
    this(
        HadoopOutputFile.fromPath(file, conf),
        mode,
        writeSupport,
        compressionCodecName,
        blockSize,
        validating,
        conf,
        MAX_PADDING_SIZE_DEFAULT,
        ParquetProperties.builder()
            .withPageSize(pageSize)
            .withDictionaryPageSize(dictionaryPageSize)
            .withDictionaryEncoding(enableDictionary)
            .withWriterVersion(writerVersion)
            .build(),
        null);
  }

  /**
   * Create a new ParquetWriter. The default block size is 128 MB. The default
   * page size is 1 MB. Default compression is no compression. Dictionary encoding is disabled.
   *
   * @param file         the file to create
   * @param writeSupport the implementation to write a record to a RecordConsumer
   * @throws IOException if there is an error while writing
   * @deprecated will be removed in 2.0.0
   */
  @Deprecated
  public ParquetWriter(Path file, WriteSupport<T> writeSupport) throws IOException {
    this(file, writeSupport, DEFAULT_COMPRESSION_CODEC_NAME, DEFAULT_BLOCK_SIZE, DEFAULT_PAGE_SIZE);
  }

  @Deprecated
  public ParquetWriter(Path file, Configuration conf, WriteSupport<T> writeSupport) throws IOException {
    this(
        file,
        writeSupport,
        DEFAULT_COMPRESSION_CODEC_NAME,
        DEFAULT_BLOCK_SIZE,
        DEFAULT_PAGE_SIZE,
        DEFAULT_PAGE_SIZE,
        DEFAULT_IS_DICTIONARY_ENABLED,
        DEFAULT_IS_VALIDATING_ENABLED,
        DEFAULT_WRITER_VERSION,
        conf);
  }
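  // Descriptive note (not in the original source): the package-private constructors
  // below form a delegation chain. The Hadoop Configuration variant wraps its conf in
  // a HadoopParquetConfiguration, the ParquetConfiguration variant builds a default
  // CodecFactory, and both end in the primary constructor that opens the file.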
  ParquetWriter(
      OutputFile file,
      ParquetFileWriter.Mode mode,
      WriteSupport<T> writeSupport,
      CompressionCodecName compressionCodecName,
      long rowGroupSize,
      boolean validating,
      Configuration conf,
      int maxPaddingSize,
      ParquetProperties encodingProps,
      FileEncryptionProperties encryptionProperties)
      throws IOException {
    this(
        file,
        mode,
        writeSupport,
        compressionCodecName,
        rowGroupSize,
        validating,
        new HadoopParquetConfiguration(conf),
        maxPaddingSize,
        encodingProps,
        encryptionProperties);
  }

  ParquetWriter(
      OutputFile file,
      ParquetFileWriter.Mode mode,
      WriteSupport<T> writeSupport,
      CompressionCodecName compressionCodecName,
      long rowGroupSize,
      boolean validating,
      ParquetConfiguration conf,
      int maxPaddingSize,
      ParquetProperties encodingProps,
      FileEncryptionProperties encryptionProperties)
      throws IOException {
    this(
        file,
        mode,
        writeSupport,
        compressionCodecName,
        new CodecFactory(conf, encodingProps.getPageSizeThreshold()),
        rowGroupSize,
        validating,
        conf,
        maxPaddingSize,
        encodingProps,
        encryptionProperties);
  }

  ParquetWriter(
      OutputFile file,
      ParquetFileWriter.Mode mode,
      WriteSupport<T> writeSupport,
      CompressionCodecName compressionCodecName,
      CompressionCodecFactory codecFactory,
      long rowGroupSize,
      boolean validating,
      ParquetConfiguration conf,
      int maxPaddingSize,
      ParquetProperties encodingProps,
      FileEncryptionProperties encryptionProperties)
      throws IOException {
    WriteSupport.WriteContext writeContext = writeSupport.init(conf);
    MessageType schema = writeContext.getSchema();

    // encryptionProperties could be built from the implementation of EncryptionPropertiesFactory when it is
    // attached.
    if (encryptionProperties == null) {
      encryptionProperties = EncryptionPropertiesHelper.createEncryptionProperties(conf, file, writeContext);
    }

    ParquetFileWriter fileWriter = new ParquetFileWriter(
        file, schema, mode, rowGroupSize, maxPaddingSize, encryptionProperties, encodingProps);
    fileWriter.start();

    this.codecFactory = codecFactory;
    CompressionCodecFactory.BytesInputCompressor compressor = codecFactory.getCompressor(compressionCodecName);

    final Map<String, String> extraMetadata;
    if (encodingProps.getExtraMetaData() == null
        || encodingProps.getExtraMetaData().isEmpty()) {
      extraMetadata = writeContext.getExtraMetaData();
    } else {
      extraMetadata = new HashMap<>(writeContext.getExtraMetaData());

      encodingProps.getExtraMetaData().forEach((metadataKey, metadataValue) -> {
        if (metadataKey.equals(OBJECT_MODEL_NAME_PROP)) {
          throw new IllegalArgumentException(
              "Cannot overwrite metadata key " + OBJECT_MODEL_NAME_PROP + ". Please use another key name.");
        }

        if (extraMetadata.put(metadataKey, metadataValue) != null) {
          throw new IllegalArgumentException(
              "Duplicate metadata key " + metadataKey + ". Please use another key name.");
        }
      });
    }

    this.writer = new InternalParquetRecordWriter<T>(
        fileWriter, writeSupport, schema, extraMetadata, rowGroupSize, compressor, validating, encodingProps);
  }

  public void write(T object) throws IOException {
    try {
      writer.write(object);
    } catch (InterruptedException e) {
      throw new IOException(e);
    }
  }

  @Override
  public void close() throws IOException {
    try {
      writer.close();
    } catch (InterruptedException e) {
      throw new IOException(e);
    } finally {
      // release after the writer closes in case it is used for a last flush
      codecFactory.release();
    }
  }

  /**
   * @return the ParquetMetadata written to the (closed) file.
   */
  public ParquetMetadata getFooter() {
    return writer.getFooter();
  }

  /**
   * @return the total size of data written to the file and buffered in memory
   */
  public long getDataSize() {
    return writer.getDataSize();
  }

  /**
   * An abstract builder class for ParquetWriter instances.
   *
   * <p>Object models should extend this builder to provide writer configuration
   * options.
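   *
   * <p>An illustrative sketch of such a subclass; {@code StringWriteSupport} is a
   * hypothetical {@link WriteSupport} implementation for single-string records:
   *
   * <pre>{@code
   * public static class StringBuilder extends ParquetWriter.Builder<String, StringBuilder> {
   *   public StringBuilder(Path path) {
   *     super(path);
   *   }
   *
   *   @Override
   *   protected StringBuilder self() {
   *     return this;
   *   }
   *
   *   @Override
   *   protected WriteSupport<String> getWriteSupport(Configuration conf) {
   *     return new StringWriteSupport(); // hypothetical WriteSupport
   *   }
   * }
   * }</pre>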
   *
   * @param <T>    The type of objects written by the constructed ParquetWriter.
   * @param <SELF> The type of this builder that is returned by builder methods
   */
  public abstract static class Builder<T, SELF extends Builder<T, SELF>> {
    private OutputFile file = null;
    private Path path = null;
    private FileEncryptionProperties encryptionProperties = null;
    private ParquetConfiguration conf = null;
    private ParquetFileWriter.Mode mode;
    private CompressionCodecFactory codecFactory = null;

    private CompressionCodecName codecName = DEFAULT_COMPRESSION_CODEC_NAME;
    private long rowGroupSize = DEFAULT_BLOCK_SIZE;
    private int maxPaddingSize = MAX_PADDING_SIZE_DEFAULT;
    private boolean enableValidation = DEFAULT_IS_VALIDATING_ENABLED;
    private ParquetProperties.Builder encodingPropsBuilder = ParquetProperties.builder();

    protected Builder(Path path) {
      this.path = path;
    }

    protected Builder(OutputFile path) {
      this.file = path;
    }

    /**
     * @return this as the correct subclass of ParquetWriter.Builder.
     */
    protected abstract SELF self();

    /**
     * @param conf a configuration
     * @return an appropriate WriteSupport for the object model.
     * @deprecated Use {@link #getWriteSupport(ParquetConfiguration)} instead
     */
    @Deprecated
    protected abstract WriteSupport<T> getWriteSupport(Configuration conf);

    /**
     * @param conf a configuration
     * @return an appropriate WriteSupport for the object model.
     */
    protected WriteSupport<T> getWriteSupport(ParquetConfiguration conf) {
      return getWriteSupport(ConfigurationUtil.createHadoopConfiguration(conf));
    }

    /**
     * Set the {@link Configuration} used by the constructed writer.
     *
     * @param conf a {@code Configuration}
     * @return this builder for method chaining.
     */
    public SELF withConf(Configuration conf) {
      this.conf = new HadoopParquetConfiguration(conf);
      return self();
    }

    /**
     * Set the {@link ParquetConfiguration} used by the constructed writer.
     *
     * @param conf a {@code ParquetConfiguration}
     * @return this builder for method chaining.
     */
    public SELF withConf(ParquetConfiguration conf) {
      this.conf = conf;
      return self();
    }

    /**
     * Set the {@link ParquetFileWriter.Mode write mode} used when creating the
     * backing file for this writer.
     *
     * @param mode a {@code ParquetFileWriter.Mode}
     * @return this builder for method chaining.
     */
    public SELF withWriteMode(ParquetFileWriter.Mode mode) {
      this.mode = mode;
      return self();
    }

    /**
     * Set the {@link CompressionCodecName compression codec} used by the
     * constructed writer.
     *
     * @param codecName a {@code CompressionCodecName}
     * @return this builder for method chaining.
     */
    public SELF withCompressionCodec(CompressionCodecName codecName) {
      this.codecName = codecName;
      return self();
    }

    /**
     * Set the {@link CompressionCodecFactory codec factory} used by the
     * constructed writer.
     *
     * @param codecFactory a {@link CompressionCodecFactory}
     * @return this builder for method chaining.
     */
    public SELF withCodecFactory(CompressionCodecFactory codecFactory) {
      this.codecFactory = codecFactory;
      return self();
    }

    /**
     * Set the {@link FileEncryptionProperties file encryption properties} used by the
     * constructed writer.
     *
     * @param encryptionProperties a {@code FileEncryptionProperties}
     * @return this builder for method chaining.
     */
    public SELF withEncryption(FileEncryptionProperties encryptionProperties) {
      this.encryptionProperties = encryptionProperties;
      return self();
    }

    /**
     * Set the Parquet format row group size used by the constructed writer.
     *
     * @param rowGroupSize an integer size in bytes
     * @return this builder for method chaining.
     * @deprecated Use {@link #withRowGroupSize(long)} instead
     */
    @Deprecated
    public SELF withRowGroupSize(int rowGroupSize) {
      return withRowGroupSize((long) rowGroupSize);
    }

    /**
     * Set the Parquet format row group size used by the constructed writer.
     *
     * @param rowGroupSize an integer size in bytes
     * @return this builder for method chaining.
     */
    public SELF withRowGroupSize(long rowGroupSize) {
      this.rowGroupSize = rowGroupSize;
      return self();
    }

    /**
     * Set the Parquet format page size used by the constructed writer.
     *
     * @param pageSize an integer size in bytes
     * @return this builder for method chaining.
     */
    public SELF withPageSize(int pageSize) {
      encodingPropsBuilder.withPageSize(pageSize);
      return self();
    }

    /**
     * Sets the Parquet format page row count limit used by the constructed writer.
     *
     * @param rowCount limit for the number of rows stored in a page
     * @return this builder for method chaining
     */
    public SELF withPageRowCountLimit(int rowCount) {
      encodingPropsBuilder.withPageRowCountLimit(rowCount);
      return self();
    }

    /**
     * Set the Parquet format dictionary page size used by the constructed
     * writer.
     *
     * @param dictionaryPageSize an integer size in bytes
     * @return this builder for method chaining.
     */
    public SELF withDictionaryPageSize(int dictionaryPageSize) {
      encodingPropsBuilder.withDictionaryPageSize(dictionaryPageSize);
      return self();
    }

    /**
     * Set the maximum amount of padding, in bytes, that will be used to align
     * row groups with blocks in the underlying filesystem. If the underlying
     * filesystem is not a block filesystem like HDFS, this has no effect.
     *
     * @param maxPaddingSize an integer size in bytes
     * @return this builder for method chaining.
     */
    public SELF withMaxPaddingSize(int maxPaddingSize) {
      this.maxPaddingSize = maxPaddingSize;
      return self();
    }

    /**
     * Enables dictionary encoding for the constructed writer.
     *
     * @return this builder for method chaining.
     */
    public SELF enableDictionaryEncoding() {
      encodingPropsBuilder.withDictionaryEncoding(true);
      return self();
    }

    /**
     * Enable or disable dictionary encoding for the constructed writer.
     *
     * @param enableDictionary whether dictionary encoding should be enabled
     * @return this builder for method chaining.
     */
    public SELF withDictionaryEncoding(boolean enableDictionary) {
      encodingPropsBuilder.withDictionaryEncoding(enableDictionary);
      return self();
    }

    /**
     * Enable or disable BYTE_STREAM_SPLIT encoding for floating-point columns.
     *
     * @param enableByteStreamSplit whether BYTE_STREAM_SPLIT encoding should be enabled
     * @return this builder for method chaining.
     */
    public SELF withByteStreamSplitEncoding(boolean enableByteStreamSplit) {
      encodingPropsBuilder.withByteStreamSplitEncoding(enableByteStreamSplit);
      return self();
    }

    /**
     * Enable or disable dictionary encoding of the specified column for the constructed writer.
     *
     * @param columnPath       the path of the column (dot-string)
     * @param enableDictionary whether dictionary encoding should be enabled
     * @return this builder for method chaining.
     */
    public SELF withDictionaryEncoding(String columnPath, boolean enableDictionary) {
      encodingPropsBuilder.withDictionaryEncoding(columnPath, enableDictionary);
      return self();
    }

    /**
     * Enables validation for the constructed writer.
     *
     * @return this builder for method chaining.
     */
    public SELF enableValidation() {
      this.enableValidation = true;
      return self();
    }

    /**
     * Enable or disable validation for the constructed writer.
     *
     * @param enableValidation whether validation should be enabled
     * @return this builder for method chaining.
     */
    public SELF withValidation(boolean enableValidation) {
      this.enableValidation = enableValidation;
      return self();
    }

    /**
     * Set the {@link WriterVersion format version} used by the constructed
     * writer.
     *
     * @param version a {@code WriterVersion}
     * @return this builder for method chaining.
     */
    public SELF withWriterVersion(WriterVersion version) {
      encodingPropsBuilder.withWriterVersion(version);
      return self();
    }

    /**
     * Enables writing page level checksums for the constructed writer.
     *
     * @return this builder for method chaining.
     */
    public SELF enablePageWriteChecksum() {
      encodingPropsBuilder.withPageWriteChecksumEnabled(true);
      return self();
    }

    /**
     * Enables or disables writing page level checksums for the constructed writer.
     *
     * @param enablePageWriteChecksum whether page checksums should be written out
     * @return this builder for method chaining.
     */
    public SELF withPageWriteChecksumEnabled(boolean enablePageWriteChecksum) {
      encodingPropsBuilder.withPageWriteChecksumEnabled(enablePageWriteChecksum);
      return self();
    }

    /**
     * Set max Bloom filter bytes for related columns.
     *
     * @param maxBloomFilterBytes the max bytes of a Bloom filter bitset for a column
     * @return this builder for method chaining
     */
    public SELF withMaxBloomFilterBytes(int maxBloomFilterBytes) {
      encodingPropsBuilder.withMaxBloomFilterBytes(maxBloomFilterBytes);
      return self();
    }

    /**
     * Sets the NDV (number of distinct values) for the specified column.
     *
     * @param columnPath the path of the column (dot-string)
     * @param ndv        the NDV of the column
     * @return this builder for method chaining.
     */
    public SELF withBloomFilterNDV(String columnPath, long ndv) {
      encodingPropsBuilder.withBloomFilterNDV(columnPath, ndv);
      return self();
    }

    /**
     * Sets the false positive probability (FPP) of the Bloom filter for the specified column.
     *
     * @param columnPath the path of the column (dot-string)
     * @param fpp        the false positive probability
     * @return this builder for method chaining
     */
    public SELF withBloomFilterFPP(String columnPath, double fpp) {
      encodingPropsBuilder.withBloomFilterFPP(columnPath, fpp);
      return self();
    }

    /**
     * When the NDV (number of distinct values) for a specified column is not set, whether to use an
     * {@code AdaptiveBloomFilter} to automatically adjust the Bloom filter size according to
     * {@code parquet.bloom.filter.max.bytes}.
     *
     * @param enabled whether to use an adaptive Bloom filter
     * @return this builder for method chaining
     */
    public SELF withAdaptiveBloomFilterEnabled(boolean enabled) {
      encodingPropsBuilder.withAdaptiveBloomFilterEnabled(enabled);
      return self();
    }

    /**
     * When {@code AdaptiveBloomFilter} is enabled, sets how many Bloom filter candidates to use
     * for the specified column.
     *
     * @param columnPath the path of the column (dot-string)
     * @param number     the number of candidates
     * @return this builder for method chaining
     */
    public SELF withBloomFilterCandidateNumber(String columnPath, int number) {
      encodingPropsBuilder.withBloomFilterCandidatesNumber(columnPath, number);
      return self();
    }

    /**
     * Sets the Bloom filter enabled/disabled.
     *
     * @param enabled whether to write Bloom filters
     * @return this builder for method chaining
     */
    public SELF withBloomFilterEnabled(boolean enabled) {
      encodingPropsBuilder.withBloomFilterEnabled(enabled);
      return self();
    }

    /**
     * Sets the Bloom filter enabled/disabled for the specified column. If not set for the column
     * specifically, the default enabled/disabled state takes effect. See
     * {@link #withBloomFilterEnabled(boolean)}.
     *
     * @param columnPath the path of the column (dot-string)
     * @param enabled    whether to write a Bloom filter for the column
     * @return this builder for method chaining
     */
    public SELF withBloomFilterEnabled(String columnPath, boolean enabled) {
      encodingPropsBuilder.withBloomFilterEnabled(columnPath, enabled);
      return self();
    }

    /**
     * Sets the minimum number of rows to write before a page size check is done.
     *
     * @param min writes at least {@code min} rows before invoking a page size check
     * @return this builder for method chaining
     */
    public SELF withMinRowCountForPageSizeCheck(int min) {
      encodingPropsBuilder.withMinRowCountForPageSizeCheck(min);
      return self();
    }

    /**
     * Sets the maximum number of rows to write before a page size check is done.
     *
     * @param max makes a page size check after {@code max} rows have been written
     * @return this builder for method chaining
     */
    public SELF withMaxRowCountForPageSizeCheck(int max) {
      encodingPropsBuilder.withMaxRowCountForPageSizeCheck(max);
      return self();
    }

    /**
     * Sets the length to be used for truncating binary values in a binary column index.
     *
     * @param length the length to truncate to
     * @return this builder for method chaining
     */
    public SELF withColumnIndexTruncateLength(int length) {
      encodingPropsBuilder.withColumnIndexTruncateLength(length);
      return self();
    }

    /**
     * Sets the length which the min/max binary values in row groups are truncated to.
     *
     * @param length the length to truncate to
     * @return this builder for method chaining
     */
    public SELF withStatisticsTruncateLength(int length) {
      encodingPropsBuilder.withStatisticsTruncateLength(length);
      return self();
    }

    /**
     * Sets additional metadata entries to be included in the file footer.
     *
     * @param extraMetaData a Map of additional stringly-typed metadata entries
     * @return this builder for method chaining
     */
    public SELF withExtraMetaData(Map<String, String> extraMetaData) {
      encodingPropsBuilder.withExtraMetaData(extraMetaData);
      return self();
    }

    /**
     * Sets the ByteBuffer allocator instance to be used for allocating memory for writing.
     *
     * @param allocator the allocator instance
     * @return this builder for method chaining
     */
    public SELF withAllocator(ByteBufferAllocator allocator) {
      encodingPropsBuilder.withAllocator(allocator);
      return self();
    }

    /**
     * Set a property that will be available to the read path. For writers that use a Hadoop
     * configuration, this is the recommended way to add configuration values.
     *
     * @param property a String property name
     * @param value    a String property value
     * @return this builder for method chaining.
     */
    public SELF config(String property, String value) {
      if (conf == null) {
        conf = new HadoopParquetConfiguration();
      }
      conf.set(property, value);
      return self();
    }

    /**
     * Sets the statistics enabled/disabled for the specified column. All column statistics are enabled by default.
     *
     * @param columnPath the path of the column (dot-string)
     * @param enabled    whether to calculate statistics for the column
     * @return this builder for method chaining
     */
    public SELF withStatisticsEnabled(String columnPath, boolean enabled) {
      encodingPropsBuilder.withStatisticsEnabled(columnPath, enabled);
      return self();
    }

    /**
     * Sets whether statistics are enabled globally. When disabled, statistics will not be collected
     * for any column unless explicitly enabled for specific columns.
     *
     * @param enabled whether to collect statistics globally
     * @return this builder for method chaining
     */
    public SELF withStatisticsEnabled(boolean enabled) {
      encodingPropsBuilder.withStatisticsEnabled(enabled);
      return self();
    }

    /**
     * Sets the size statistics enabled/disabled for the specified column. All column size
     * statistics are enabled by default.
     *
     * @param columnPath the path of the column (dot-string)
     * @param enabled    whether to collect size statistics for the column
     * @return this builder for method chaining
     */
    public SELF withSizeStatisticsEnabled(String columnPath, boolean enabled) {
      encodingPropsBuilder.withSizeStatisticsEnabled(columnPath, enabled);
      return self();
    }

    /**
     * Sets whether size statistics are enabled globally. When disabled, size statistics will not be collected
     * for any column unless explicitly enabled for specific columns.
     *
     * @param enabled whether to collect size statistics globally
     * @return this builder for method chaining
     */
    public SELF withSizeStatisticsEnabled(boolean enabled) {
      encodingPropsBuilder.withSizeStatisticsEnabled(enabled);
      return self();
    }

    /**
     * Build a {@link ParquetWriter} with the accumulated configuration.
     *
     * @return a configured {@code ParquetWriter} instance.
     * @throws IOException if there is an error while creating the writer
     */
    public ParquetWriter<T> build() throws IOException {
      if (conf == null) {
        conf = new HadoopParquetConfiguration();
      }
      ParquetProperties encodingProps = encodingPropsBuilder.build();
      if (codecFactory == null) {
        codecFactory = new CodecFactory(conf, encodingProps.getPageSizeThreshold());
      }

      return new ParquetWriter<>(
          (file != null)
              ? file
              : HadoopOutputFile.fromPath(path, ConfigurationUtil.createHadoopConfiguration(conf)),
          mode,
          getWriteSupport(conf),
          codecName,
          codecFactory,
          rowGroupSize,
          enableValidation,
          conf,
          maxPaddingSize,
          encodingProps,
          encryptionProperties);
    }
  }
}




