All Downloads are FREE. Search and download functionalities are using the official Maven repository.

org.apache.hudi.common.config.HoodieStorageConfig Maven / Gradle / Ivy

/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.hudi.common.config;

import org.apache.hudi.common.bloom.BloomFilterTypeCode;

import javax.annotation.concurrent.Immutable;

import java.io.File;
import java.io.FileReader;
import java.io.IOException;
import java.util.Properties;

/**
 * Storage related config.
 */
@Immutable
@ConfigClassProperty(name = "Storage Configs",
    groupName = ConfigGroups.Names.WRITE_CLIENT,
    description = "Configurations that control aspects around writing, sizing, reading base and log files.")
public class HoodieStorageConfig extends HoodieConfig {

  public static final ConfigProperty PARQUET_MAX_FILE_SIZE = ConfigProperty
      .key("hoodie.parquet.max.file.size")
      .defaultValue(String.valueOf(120 * 1024 * 1024))
      .withDocumentation("Target size in bytes for parquet files produced by Hudi write phases. "
          + "For DFS, this needs to be aligned with the underlying filesystem block size for optimal performance.");

  public static final ConfigProperty PARQUET_BLOCK_SIZE = ConfigProperty
      .key("hoodie.parquet.block.size")
      .defaultValue(String.valueOf(120 * 1024 * 1024))
      .markAdvanced()
      .withDocumentation("Parquet RowGroup size in bytes. It's recommended to make this large enough that scan costs can be"
          + " amortized by packing enough column values into a single row group.");

  public static final ConfigProperty PARQUET_PAGE_SIZE = ConfigProperty
      .key("hoodie.parquet.page.size")
      .defaultValue(String.valueOf(1 * 1024 * 1024))
      .markAdvanced()
      .withDocumentation("Parquet page size in bytes. Page is the unit of read within a parquet file. "
          + "Within a block, pages are compressed separately.");

  public static final ConfigProperty ORC_FILE_MAX_SIZE = ConfigProperty
      .key("hoodie.orc.max.file.size")
      .defaultValue(String.valueOf(120 * 1024 * 1024))
      .markAdvanced()
      .withDocumentation("Target file size in bytes for ORC base files.");

  public static final ConfigProperty ORC_STRIPE_SIZE = ConfigProperty
      .key("hoodie.orc.stripe.size")
      .defaultValue(String.valueOf(64 * 1024 * 1024))
      .markAdvanced()
      .withDocumentation("Size of the memory buffer in bytes for writing");

  public static final ConfigProperty ORC_BLOCK_SIZE = ConfigProperty
      .key("hoodie.orc.block.size")
      .defaultValue(ORC_FILE_MAX_SIZE.defaultValue())
      .markAdvanced()
      .withDocumentation("ORC block size, recommended to be aligned with the target file size.");

  public static final ConfigProperty HFILE_MAX_FILE_SIZE = ConfigProperty
      .key("hoodie.hfile.max.file.size")
      .defaultValue(String.valueOf(120 * 1024 * 1024))
      .markAdvanced()
      .withDocumentation("Target file size in bytes for HFile base files.");

  public static final ConfigProperty HFILE_BLOCK_SIZE = ConfigProperty
      .key("hoodie.hfile.block.size")
      .defaultValue(String.valueOf(1024 * 1024))
      .markAdvanced()
      .withDocumentation("Lower values increase the size in bytes of metadata tracked within HFile, but can offer potentially "
          + "faster lookup times.");

  public static final ConfigProperty LOGFILE_DATA_BLOCK_FORMAT = ConfigProperty
      .key("hoodie.logfile.data.block.format")
      .noDefaultValue()
      .markAdvanced()
      .withDocumentation("Format of the data block within delta logs. Following formats are currently supported \"avro\", \"hfile\", \"parquet\"");

  public static final ConfigProperty LOGFILE_MAX_SIZE = ConfigProperty
      .key("hoodie.logfile.max.size")
      .defaultValue(String.valueOf(1024 * 1024 * 1024)) // 1 GB
      .markAdvanced()
      .withDocumentation("LogFile max size in bytes. This is the maximum size allowed for a log file "
          + "before it is rolled over to the next version.");

  public static final ConfigProperty LOGFILE_DATA_BLOCK_MAX_SIZE = ConfigProperty
      .key("hoodie.logfile.data.block.max.size")
      .defaultValue(String.valueOf(256 * 1024 * 1024))
      .markAdvanced()
      .withDocumentation("LogFile Data block max size in bytes. This is the maximum size allowed for a single data block "
          + "to be appended to a log file. This helps to make sure the data appended to the log file is broken up "
          + "into sizable blocks to prevent from OOM errors. This size should be greater than the JVM memory.");

  public static final ConfigProperty PARQUET_COMPRESSION_RATIO_FRACTION = ConfigProperty
      .key("hoodie.parquet.compression.ratio")
      .defaultValue(String.valueOf(0.1))
      .markAdvanced()
      .withDocumentation("Expected compression of parquet data used by Hudi, when it tries to size new parquet files. "
          + "Increase this value, if bulk_insert is producing smaller than expected sized files");

  // Default compression codec for parquet
  public static final ConfigProperty PARQUET_COMPRESSION_CODEC_NAME = ConfigProperty
      .key("hoodie.parquet.compression.codec")
      .defaultValue("gzip")
      .withDocumentation("Compression Codec for parquet files");

  public static final ConfigProperty PARQUET_DICTIONARY_ENABLED = ConfigProperty
      .key("hoodie.parquet.dictionary.enabled")
      .defaultValue(true)
      .markAdvanced()
      .withDocumentation("Whether to use dictionary encoding");

  public static final ConfigProperty PARQUET_WRITE_LEGACY_FORMAT_ENABLED = ConfigProperty
      .key("hoodie.parquet.writelegacyformat.enabled")
      .defaultValue("false")
      .markAdvanced()
      .withDocumentation("Sets spark.sql.parquet.writeLegacyFormat. If true, data will be written in a way of Spark 1.4 and earlier. "
          + "For example, decimal values will be written in Parquet's fixed-length byte array format which other systems such as Apache Hive and Apache Impala use. "
          + "If false, the newer format in Parquet will be used. For example, decimals will be written in int-based format.");

  public static final ConfigProperty PARQUET_OUTPUT_TIMESTAMP_TYPE = ConfigProperty
      .key("hoodie.parquet.outputtimestamptype")
      .defaultValue("TIMESTAMP_MICROS")
      .markAdvanced()
      .withDocumentation("Sets spark.sql.parquet.outputTimestampType. Parquet timestamp type to use when Spark writes data to Parquet files.");

  // SPARK-38094 Spark 3.3 checks if this field is enabled. Hudi has to provide this or there would be NPE thrown
  // Would ONLY be effective with Spark 3.3+
  // default value is true which is in accordance with Spark 3.3
  public static final ConfigProperty PARQUET_FIELD_ID_WRITE_ENABLED = ConfigProperty
      .key("hoodie.parquet.field_id.write.enabled")
      .defaultValue("true")
      .markAdvanced()
      .sinceVersion("0.12.0")
      .withDocumentation("Would only be effective with Spark 3.3+. Sets spark.sql.parquet.fieldId.write.enabled. "
          + "If enabled, Spark will write out parquet native field ids that are stored inside StructField's metadata as parquet.field.id to parquet files.");

  public static final ConfigProperty PARQUET_WITH_BLOOM_FILTER_ENABLED = ConfigProperty
      .key("hoodie.parquet.bloom.filter.enabled")
      .defaultValue(true)
      .markAdvanced()
      .sinceVersion("0.15.0")
      .withDocumentation("Control whether to write bloom filter or not. Default true. "
          + "We can set to false in non bloom index cases for CPU resource saving.");

  public static final ConfigProperty HFILE_COMPRESSION_ALGORITHM_NAME = ConfigProperty
      .key("hoodie.hfile.compression.algorithm")
      .defaultValue("GZ")
      .markAdvanced()
      .withDocumentation("Compression codec to use for hfile base files.");

  public static final ConfigProperty ORC_COMPRESSION_CODEC_NAME = ConfigProperty
      .key("hoodie.orc.compression.codec")
      .defaultValue("ZLIB")
      .markAdvanced()
      .withDocumentation("Compression codec to use for ORC base files.");

  // Default compression ratio for log file to parquet, general 3x
  public static final ConfigProperty LOGFILE_TO_PARQUET_COMPRESSION_RATIO_FRACTION = ConfigProperty
      .key("hoodie.logfile.to.parquet.compression.ratio")
      .defaultValue(String.valueOf(0.35))
      .markAdvanced()
      .withDocumentation("Expected additional compression as records move from log files to parquet. Used for merge_on_read "
          + "table to send inserts into log files & control the size of compacted parquet file.");

  // Configs that control the bloom filter that is written to the file footer
  public static final ConfigProperty BLOOM_FILTER_TYPE = ConfigProperty
      .key("hoodie.bloom.index.filter.type")
      .defaultValue(BloomFilterTypeCode.DYNAMIC_V0.name())
      .withValidValues(BloomFilterTypeCode.SIMPLE.name(), BloomFilterTypeCode.DYNAMIC_V0.name())
      .markAdvanced()
      .withDocumentation(BloomFilterTypeCode.class);

  public static final ConfigProperty BLOOM_FILTER_NUM_ENTRIES_VALUE = ConfigProperty
      .key("hoodie.index.bloom.num_entries")
      .defaultValue("60000")
      .markAdvanced()
      .withDocumentation("Only applies if index type is BLOOM. "
          + "This is the number of entries to be stored in the bloom filter. "
          + "The rationale for the default: Assume the maxParquetFileSize is 128MB and averageRecordSize is 1kb and "
          + "hence we approx a total of 130K records in a file. The default (60000) is roughly half of this approximation. "
          + "Warning: Setting this very low, will generate a lot of false positives and index lookup "
          + "will have to scan a lot more files than it has to and setting this to a very high number will "
          + "increase the size every base file linearly (roughly 4KB for every 50000 entries). "
          + "This config is also used with DYNAMIC bloom filter which determines the initial size for the bloom.");

  public static final ConfigProperty BLOOM_FILTER_FPP_VALUE = ConfigProperty
      .key("hoodie.index.bloom.fpp")
      .defaultValue("0.000000001")
      .markAdvanced()
      .withDocumentation("Only applies if index type is BLOOM. "
          + "Error rate allowed given the number of entries. This is used to calculate how many bits should be "
          + "assigned for the bloom filter and the number of hash functions. This is usually set very low (default: 0.000000001), "
          + "we like to tradeoff disk space for lower false positives. "
          + "If the number of entries added to bloom filter exceeds the configured value (hoodie.index.bloom.num_entries), "
          + "then this fpp may not be honored.");

  public static final ConfigProperty BLOOM_FILTER_DYNAMIC_MAX_ENTRIES = ConfigProperty
      .key("hoodie.bloom.index.filter.dynamic.max.entries")
      .defaultValue("100000")
      .markAdvanced()
      .withDocumentation("The threshold for the maximum number of keys to record in a dynamic Bloom filter row. "
          + "Only applies if filter type is BloomFilterTypeCode.DYNAMIC_V0.");

  public static final ConfigProperty HOODIE_AVRO_WRITE_SUPPORT_CLASS = ConfigProperty
      .key("hoodie.avro.write.support.class")
      .defaultValue("org.apache.hudi.avro.HoodieAvroWriteSupport")
      .markAdvanced()
      .sinceVersion("0.14.0")
      .withDocumentation("Provided write support class should extend HoodieAvroWriteSupport class "
          + "and it is loaded at runtime. This is only required when trying to "
          + "override the existing write context.");

  public static final ConfigProperty HOODIE_PARQUET_SPARK_ROW_WRITE_SUPPORT_CLASS = ConfigProperty
      .key("hoodie.parquet.spark.row.write.support.class")
      .defaultValue("org.apache.hudi.io.storage.row.HoodieRowParquetWriteSupport")
      .markAdvanced()
      .sinceVersion("0.15.0")
      .withDocumentation("Provided write support class should extend HoodieRowParquetWriteSupport class "
          + "and it is loaded at runtime. This is only required when trying to "
          + "override the existing write context when `hoodie.datasource.write.row.writer.enable=true`.");

  public static final ConfigProperty HOODIE_STORAGE_CLASS = ConfigProperty
      .key("hoodie.storage.class")
      .defaultValue("org.apache.hudi.storage.hadoop.HoodieHadoopStorage")
      .markAdvanced()
      .sinceVersion("0.15.0")
      .withDocumentation("The fully-qualified class name of the `HoodieStorage` implementation class to instantiate. "
          + "The provided class should implement `org.apache.hudi.storage.HoodieStorage`");

  public static final ConfigProperty HOODIE_IO_FACTORY_CLASS = ConfigProperty
      .key("hoodie.io.factory.class")
      .defaultValue("org.apache.hudi.io.hadoop.HoodieHadoopIOFactory")
      .markAdvanced()
      .sinceVersion("0.15.0")
      .withDocumentation("The fully-qualified class name of the factory class to return readers and writers of files used "
          + "by Hudi. The provided class should implement `org.apache.hudi.io.storage.HoodieIOFactory`.");

  /**
   * @deprecated Use {@link #PARQUET_MAX_FILE_SIZE} and its methods instead
   */
  @Deprecated
  public static final String PARQUET_FILE_MAX_BYTES = PARQUET_MAX_FILE_SIZE.key();
  /**
   * @deprecated Use {@link #PARQUET_MAX_FILE_SIZE} and its methods instead
   */
  @Deprecated
  public static final String DEFAULT_PARQUET_FILE_MAX_BYTES = PARQUET_MAX_FILE_SIZE.defaultValue();
  /**
   * @deprecated Use {@link #PARQUET_BLOCK_SIZE} and its methods instead
   */
  @Deprecated
  public static final String PARQUET_BLOCK_SIZE_BYTES = PARQUET_BLOCK_SIZE.key();
  /**
   * @deprecated Use {@link #PARQUET_BLOCK_SIZE} and its methods instead
   */
  @Deprecated
  public static final String DEFAULT_PARQUET_BLOCK_SIZE_BYTES = PARQUET_BLOCK_SIZE.defaultValue();
  /**
   * @deprecated Use {@link #PARQUET_PAGE_SIZE} and its methods instead
   */
  @Deprecated
  public static final String PARQUET_PAGE_SIZE_BYTES = PARQUET_PAGE_SIZE.key();
  /**
   * @deprecated Use {@link #PARQUET_PAGE_SIZE} and its methods instead
   */
  @Deprecated
  public static final String DEFAULT_PARQUET_PAGE_SIZE_BYTES = PARQUET_PAGE_SIZE.defaultValue();
  /**
   * @deprecated Use {@link #HFILE_MAX_FILE_SIZE} and its methods instead
   */
  @Deprecated
  public static final String HFILE_FILE_MAX_BYTES = HFILE_MAX_FILE_SIZE.key();
  /**
   * @deprecated Use {@link #HFILE_MAX_FILE_SIZE} and its methods instead
   */
  @Deprecated
  public static final String DEFAULT_HFILE_FILE_MAX_BYTES = HFILE_MAX_FILE_SIZE.defaultValue();
  /**
   * @deprecated Use {@link #HFILE_BLOCK_SIZE} and its methods instead
   */
  @Deprecated
  public static final String HFILE_BLOCK_SIZE_BYTES = HFILE_BLOCK_SIZE.defaultValue();
  /**
   * @deprecated Use {@link #HFILE_BLOCK_SIZE} and its methods instead
   */
  @Deprecated
  public static final String DEFAULT_HFILE_BLOCK_SIZE_BYTES = HFILE_BLOCK_SIZE.defaultValue();
  /**
   * @deprecated Use {@link #LOGFILE_MAX_SIZE} and its methods instead
   */
  @Deprecated
  public static final String LOGFILE_SIZE_MAX_BYTES = LOGFILE_MAX_SIZE.key();
  /**
   * @deprecated Use {@link #LOGFILE_MAX_SIZE} and its methods instead
   */
  @Deprecated
  public static final String DEFAULT_LOGFILE_SIZE_MAX_BYTES = LOGFILE_MAX_SIZE.defaultValue();
  /**
   * @deprecated Use {@link #LOGFILE_DATA_BLOCK_MAX_SIZE} and its methods instead
   */
  @Deprecated
  public static final String LOGFILE_DATA_BLOCK_SIZE_MAX_BYTES = LOGFILE_DATA_BLOCK_MAX_SIZE.key();
  /**
   * @deprecated Use {@link #LOGFILE_DATA_BLOCK_MAX_SIZE} and its methods instead
   */
  @Deprecated
  public static final String DEFAULT_LOGFILE_DATA_BLOCK_SIZE_MAX_BYTES = LOGFILE_DATA_BLOCK_MAX_SIZE.defaultValue();
  /**
   * @deprecated Use {@link #PARQUET_COMPRESSION_RATIO_FRACTION} and its methods instead
   */
  @Deprecated
  public static final String PARQUET_COMPRESSION_RATIO = PARQUET_COMPRESSION_RATIO_FRACTION.key();
  /**
   * @deprecated Use {@link #PARQUET_COMPRESSION_RATIO_FRACTION} and its methods instead
   */
  @Deprecated
  public static final String DEFAULT_STREAM_COMPRESSION_RATIO = PARQUET_COMPRESSION_RATIO_FRACTION.defaultValue();
  /**
   * @deprecated Use {@link #PARQUET_COMPRESSION_CODEC_NAME} and its methods instead
   */
  @Deprecated
  public static final String PARQUET_COMPRESSION_CODEC = PARQUET_COMPRESSION_CODEC_NAME.key();
  /**
   * @deprecated Use {@link #HFILE_COMPRESSION_ALGORITHM_NAME} and its methods instead
   */
  @Deprecated
  public static final String HFILE_COMPRESSION_ALGORITHM = HFILE_COMPRESSION_ALGORITHM_NAME.key();
  /**
   * @deprecated Use {@link #PARQUET_COMPRESSION_CODEC_NAME} and its methods instead
   */
  @Deprecated
  public static final String DEFAULT_PARQUET_COMPRESSION_CODEC = PARQUET_COMPRESSION_CODEC_NAME.defaultValue();
  /**
   * @deprecated Use {@link #HFILE_COMPRESSION_ALGORITHM_NAME} and its methods instead
   */
  @Deprecated
  public static final String DEFAULT_HFILE_COMPRESSION_ALGORITHM = HFILE_COMPRESSION_ALGORITHM_NAME.defaultValue();
  /**
   * @deprecated Use {@link #LOGFILE_TO_PARQUET_COMPRESSION_RATIO_FRACTION} and its methods instead
   */
  @Deprecated
  public static final String LOGFILE_TO_PARQUET_COMPRESSION_RATIO = LOGFILE_TO_PARQUET_COMPRESSION_RATIO_FRACTION.key();
  /**
   * @deprecated Use {@link #LOGFILE_TO_PARQUET_COMPRESSION_RATIO_FRACTION} and its methods instead
   */
  @Deprecated
  public static final String DEFAULT_LOGFILE_TO_PARQUET_COMPRESSION_RATIO = LOGFILE_TO_PARQUET_COMPRESSION_RATIO_FRACTION.defaultValue();

  private HoodieStorageConfig() {
    super();
  }

  public static HoodieStorageConfig.Builder newBuilder() {
    return new Builder();
  }

  public static class Builder {

    private final HoodieStorageConfig storageConfig = new HoodieStorageConfig();

    public Builder fromFile(File propertiesFile) throws IOException {
      try (FileReader reader = new FileReader(propertiesFile)) {
        this.storageConfig.getProps().load(reader);
        return this;
      }
    }

    public Builder fromProperties(Properties props) {
      this.storageConfig.getProps().putAll(props);
      return this;
    }

    public Builder parquetMaxFileSize(long maxFileSize) {
      storageConfig.setValue(PARQUET_MAX_FILE_SIZE, String.valueOf(maxFileSize));
      return this;
    }

    public Builder parquetBlockSize(int blockSize) {
      storageConfig.setValue(PARQUET_BLOCK_SIZE, String.valueOf(blockSize));
      return this;
    }

    public Builder parquetPageSize(int pageSize) {
      storageConfig.setValue(PARQUET_PAGE_SIZE, String.valueOf(pageSize));
      return this;
    }

    public Builder hfileMaxFileSize(long maxFileSize) {
      storageConfig.setValue(HFILE_MAX_FILE_SIZE, String.valueOf(maxFileSize));
      return this;
    }

    public Builder hfileBlockSize(int blockSize) {
      storageConfig.setValue(HFILE_BLOCK_SIZE, String.valueOf(blockSize));
      return this;
    }

    public Builder logFileDataBlockFormat(String format) {
      storageConfig.setValue(LOGFILE_DATA_BLOCK_FORMAT, format);
      return this;
    }

    public Builder logFileDataBlockMaxSize(long dataBlockSize) {
      storageConfig.setValue(LOGFILE_DATA_BLOCK_MAX_SIZE, String.valueOf(dataBlockSize));
      return this;
    }

    public Builder logFileMaxSize(long logFileSize) {
      storageConfig.setValue(LOGFILE_MAX_SIZE, String.valueOf(logFileSize));
      return this;
    }

    public Builder parquetCompressionRatio(double parquetCompressionRatio) {
      storageConfig.setValue(PARQUET_COMPRESSION_RATIO_FRACTION, String.valueOf(parquetCompressionRatio));
      return this;
    }

    public Builder parquetCompressionCodec(String parquetCompressionCodec) {
      storageConfig.setValue(PARQUET_COMPRESSION_CODEC_NAME, parquetCompressionCodec);
      return this;
    }

    public Builder parquetWriteLegacyFormat(String parquetWriteLegacyFormat) {
      storageConfig.setValue(PARQUET_WRITE_LEGACY_FORMAT_ENABLED, parquetWriteLegacyFormat);
      return this;
    }

    public Builder parquetOutputTimestampType(String parquetOutputTimestampType) {
      storageConfig.setValue(PARQUET_OUTPUT_TIMESTAMP_TYPE, parquetOutputTimestampType);
      return this;
    }

    public Builder parquetFieldIdWrite(String parquetFieldIdWrite) {
      storageConfig.setValue(PARQUET_FIELD_ID_WRITE_ENABLED, parquetFieldIdWrite);
      return this;
    }

    public Builder parquetBloomFilterEnable(boolean parquetBloomFilterEnable) {
      storageConfig.setValue(PARQUET_WITH_BLOOM_FILTER_ENABLED, String.valueOf(parquetBloomFilterEnable));
      return this;
    }

    public Builder hfileCompressionAlgorithm(String hfileCompressionAlgorithm) {
      storageConfig.setValue(HFILE_COMPRESSION_ALGORITHM_NAME, hfileCompressionAlgorithm);
      return this;
    }

    public Builder logFileToParquetCompressionRatio(double logFileToParquetCompressionRatio) {
      storageConfig.setValue(LOGFILE_TO_PARQUET_COMPRESSION_RATIO_FRACTION, String.valueOf(logFileToParquetCompressionRatio));
      return this;
    }

    public Builder orcMaxFileSize(long maxFileSize) {
      storageConfig.setValue(ORC_FILE_MAX_SIZE, String.valueOf(maxFileSize));
      return this;
    }

    public Builder orcStripeSize(int orcStripeSize) {
      storageConfig.setValue(ORC_STRIPE_SIZE, String.valueOf(orcStripeSize));
      return this;
    }

    public Builder orcBlockSize(int orcBlockSize) {
      storageConfig.setValue(ORC_BLOCK_SIZE, String.valueOf(orcBlockSize));
      return this;
    }

    public Builder orcCompressionCodec(String orcCompressionCodec) {
      storageConfig.setValue(ORC_COMPRESSION_CODEC_NAME, orcCompressionCodec);
      return this;
    }

    public Builder withAvroWriteSupport(String avroWriteSupportClassName) {
      storageConfig.setValue(HOODIE_AVRO_WRITE_SUPPORT_CLASS, avroWriteSupportClassName);
      return this;
    }

    public HoodieStorageConfig build() {
      storageConfig.setDefaults(HoodieStorageConfig.class.getName());
      return storageConfig;
    }
  }
}




© 2015 - 2024 Weber Informatics LLC | Privacy Policy