org.apache.hudi.common.config.HoodieMetadataConfig Maven / Gradle / Ivy
The newest version!
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hudi.common.config;
import org.apache.hudi.common.engine.EngineType;
import org.apache.hudi.common.table.view.FileSystemViewStorageConfig;
import org.apache.hudi.common.util.Option;
import org.apache.hudi.common.util.StringUtils;
import org.apache.hudi.exception.HoodieNotSupportedException;
import javax.annotation.concurrent.Immutable;
import java.io.File;
import java.io.FileReader;
import java.io.IOException;
import java.util.List;
import java.util.Properties;
/**
* Configurations used by the HUDI Metadata Table.
*/
@Immutable
@ConfigClassProperty(name = "Metadata Configs",
groupName = ConfigGroups.Names.WRITE_CLIENT,
description = "Configurations used by the Hudi Metadata Table. "
+ "This table maintains the metadata about a given Hudi table (e.g file listings) "
+ " to avoid overhead of accessing cloud storage, during queries.")
public final class HoodieMetadataConfig extends HoodieConfig {
// Asynchronous cleaning for metadata table is disabled by default
public static final boolean DEFAULT_METADATA_ASYNC_CLEAN = false;
// Full scanning of log files while reading log records is enabled by default for metadata table
public static final boolean DEFAULT_METADATA_ENABLE_FULL_SCAN_LOG_FILES = true;
// Meta fields are not populated by default for metadata table
public static final boolean DEFAULT_METADATA_POPULATE_META_FIELDS = false;
// Default number of commits to retain, without cleaning, on metadata table
public static final int DEFAULT_METADATA_CLEANER_COMMITS_RETAINED = 20;
// Prefix shared by most of the config keys declared in this class
public static final String METADATA_PREFIX = "hoodie.metadata";
// Key suffix; appended to METADATA_PREFIX to form the key of ENABLE_OPTIMIZED_LOG_BLOCKS_SCAN
public static final String OPTIMIZED_LOG_BLOCKS_SCAN = ".optimized.log.blocks.scan.enable";
// Enable the internal Metadata Table which saves file listings.
// Typed as ConfigProperty<Boolean> so that ENABLE.defaultValue() can be used where a boolean is required.
public static final ConfigProperty<Boolean> ENABLE = ConfigProperty
    .key(METADATA_PREFIX + ".enable")
    .defaultValue(true)
    .sinceVersion("0.7.0")
    .withDocumentation("Enable the internal metadata table which serves table metadata like level file listings");
// Default for readers: metadata table usage is enabled
public static final boolean DEFAULT_METADATA_ENABLE_FOR_READERS = true;
// Enable metrics for internal Metadata Table
public static final ConfigProperty<Boolean> METRICS_ENABLE = ConfigProperty
    .key(METADATA_PREFIX + ".metrics.enable")
    .defaultValue(false)
    .markAdvanced()
    .sinceVersion("0.7.0")
    .withDocumentation("Enable publishing of metrics around metadata table.");
// Async index
public static final ConfigProperty<Boolean> ASYNC_INDEX_ENABLE = ConfigProperty
    .key(METADATA_PREFIX + ".index.async")
    .defaultValue(false)
    .markAdvanced()
    .sinceVersion("0.11.0")
    .withDocumentation("Enable asynchronous indexing of metadata table.");
// Maximum delta commits before compaction occurs
public static final ConfigProperty<Integer> COMPACT_NUM_DELTA_COMMITS = ConfigProperty
    .key(METADATA_PREFIX + ".compact.max.delta.commits")
    .defaultValue(10)
    .markAdvanced()
    .sinceVersion("0.7.0")
    .withDocumentation("Controls how often the metadata table is compacted.");
// Log compaction switch; note the default is the String "false", so this property is String-typed
public static final ConfigProperty<String> ENABLE_LOG_COMPACTION_ON_METADATA_TABLE = ConfigProperty
    .key(METADATA_PREFIX + ".log.compaction.enable")
    .defaultValue("false")
    .markAdvanced()
    .sinceVersion("0.14.0")
    .withDocumentation("This configs enables logcompaction for the metadata table.");
// Log blocks threshold, after a file slice crosses this threshold log compact operation is scheduled.
public static final ConfigProperty<Integer> LOG_COMPACT_BLOCKS_THRESHOLD = ConfigProperty
    .key(METADATA_PREFIX + ".log.compaction.blocks.threshold")
    .defaultValue(5)
    .markAdvanced()
    .sinceVersion("0.14.0")
    .withDocumentation("Controls the criteria to log compacted files groups in metadata table.");
// Regex to filter out matching directories during bootstrap
public static final ConfigProperty<String> DIR_FILTER_REGEX = ConfigProperty
    .key(METADATA_PREFIX + ".dir.filter.regex")
    .defaultValue("")
    .markAdvanced()
    .sinceVersion("0.7.0")
    .withDocumentation("Directories matching this regex, will be filtered out when initializing metadata table from lake storage for the first time.");
// Listing parallelism; the key does not use METADATA_PREFIX
public static final ConfigProperty<Integer> FILE_LISTING_PARALLELISM_VALUE = ConfigProperty
    .key("hoodie.file.listing.parallelism")
    .defaultValue(200)
    .markAdvanced()
    .sinceVersion("0.7.0")
    .withDocumentation("Parallelism to use, when listing the table on lake storage.");
// ---- Bloom filter index ----
public static final ConfigProperty<Boolean> ENABLE_METADATA_INDEX_BLOOM_FILTER = ConfigProperty
    .key(METADATA_PREFIX + ".index.bloom.filter.enable")
    .defaultValue(false)
    .sinceVersion("0.11.0")
    .withDocumentation("Enable indexing bloom filters of user data files under metadata table. When enabled, "
        + "metadata table will have a partition to store the bloom filter index and will be "
        + "used during the index lookups.");
public static final ConfigProperty<Integer> METADATA_INDEX_BLOOM_FILTER_FILE_GROUP_COUNT = ConfigProperty
    .key(METADATA_PREFIX + ".index.bloom.filter.file.group.count")
    .defaultValue(4)
    .markAdvanced()
    .sinceVersion("0.11.0")
    .withDocumentation("Metadata bloom filter index partition file group count. This controls the size of the base and "
        + "log files and read parallelism in the bloom filter index partition. The recommendation is to size the "
        + "file group count such that the base files are under 1GB.");
public static final ConfigProperty<Integer> BLOOM_FILTER_INDEX_PARALLELISM = ConfigProperty
    .key(METADATA_PREFIX + ".index.bloom.filter.parallelism")
    .defaultValue(200)
    .markAdvanced()
    .sinceVersion("0.11.0")
    .withDocumentation("Parallelism to use for generating bloom filter index in metadata table.");
// ---- Column stats index ----
public static final ConfigProperty<Boolean> ENABLE_METADATA_INDEX_COLUMN_STATS = ConfigProperty
    .key(METADATA_PREFIX + ".index.column.stats.enable")
    .defaultValue(false)
    .sinceVersion("0.11.0")
    .withDocumentation("Enable indexing column ranges of user data files under metadata table key lookups. When "
        + "enabled, metadata table will have a partition to store the column ranges and will be "
        + "used for pruning files during the index lookups.");
public static final ConfigProperty<Integer> METADATA_INDEX_COLUMN_STATS_FILE_GROUP_COUNT = ConfigProperty
    .key(METADATA_PREFIX + ".index.column.stats.file.group.count")
    .defaultValue(2)
    .markAdvanced()
    .sinceVersion("0.11.0")
    .withDocumentation("Metadata column stats partition file group count. This controls the size of the base and "
        + "log files and read parallelism in the column stats index partition. The recommendation is to size the "
        + "file group count such that the base files are under 1GB.");
public static final ConfigProperty<Integer> COLUMN_STATS_INDEX_PARALLELISM = ConfigProperty
    .key(METADATA_PREFIX + ".index.column.stats.parallelism")
    .defaultValue(200)
    .markAdvanced()
    .sinceVersion("0.11.0")
    .withDocumentation("Parallelism to use, when generating column stats index.");
public static final ConfigProperty<String> COLUMN_STATS_INDEX_FOR_COLUMNS = ConfigProperty
    .key(METADATA_PREFIX + ".index.column.stats.column.list")
    .noDefaultValue()
    .markAdvanced()
    .sinceVersion("0.11.0")
    .withDocumentation("Comma-separated list of columns for which column stats index will be built. If not set, all columns will be indexed");
// Must be declared after COLUMN_STATS_INDEX_FOR_COLUMNS: its documentation embeds that property's key
public static final ConfigProperty<Integer> COLUMN_STATS_INDEX_MAX_COLUMNS = ConfigProperty
    .key(METADATA_PREFIX + ".index.column.stats.max.columns.to.index")
    .defaultValue(32)
    .markAdvanced()
    .sinceVersion("1.0.0")
    .withDocumentation("Maximum number of columns to generate column stats for. If the config `"
        + COLUMN_STATS_INDEX_FOR_COLUMNS.key() + "` is set, this config will be ignored. "
        + "If the config `" + COLUMN_STATS_INDEX_FOR_COLUMNS.key() + "` is not set, "
        + "the column stats of the first `n` columns (`n` defined by this config) in the "
        + "table schema are generated.");
// The two values accepted by COLUMN_STATS_INDEX_PROCESSING_MODE_OVERRIDE
public static final String COLUMN_STATS_INDEX_PROCESSING_MODE_IN_MEMORY = "in-memory";
public static final String COLUMN_STATS_INDEX_PROCESSING_MODE_ENGINE = "engine";
public static final ConfigProperty<String> COLUMN_STATS_INDEX_PROCESSING_MODE_OVERRIDE = ConfigProperty
    .key(METADATA_PREFIX + ".index.column.stats.processing.mode.override")
    .noDefaultValue()
    .withValidValues(COLUMN_STATS_INDEX_PROCESSING_MODE_IN_MEMORY, COLUMN_STATS_INDEX_PROCESSING_MODE_ENGINE)
    .markAdvanced()
    .sinceVersion("0.12.0")
    // A trailing space after "either" keeps the concatenated sentence from reading "either'in-memory'".
    .withDocumentation("By default Column Stats Index is automatically determining whether it should be read and processed either "
        + "'in-memory' (w/in executing process) or using Spark (on a cluster), based on some factors like the size of the Index "
        + "and how many columns are read. This config allows to override this behavior.");
public static final ConfigProperty<Integer> COLUMN_STATS_INDEX_IN_MEMORY_PROJECTION_THRESHOLD = ConfigProperty
    .key(METADATA_PREFIX + ".index.column.stats.inMemory.projection.threshold")
    .defaultValue(100000)
    .markAdvanced()
    .sinceVersion("0.12.0")
    .withDocumentation("When reading Column Stats Index, if the size of the expected resulting projection is below the in-memory"
        + " threshold (counted by the # of rows), it will be attempted to be loaded \"in-memory\" (ie not using the execution engine"
        + " like Spark, Flink, etc). If the value is above the threshold execution engine will be used to compose the projection.");
// Columns covered by the bloom filter index (no default value)
public static final ConfigProperty<String> BLOOM_FILTER_INDEX_FOR_COLUMNS = ConfigProperty
    .key(METADATA_PREFIX + ".index.bloom.filter.column.list")
    .noDefaultValue()
    .markAdvanced()
    .sinceVersion("0.11.0")
    .withDocumentation("Comma-separated list of columns for which bloom filter index will be built. If not set, only record key will be indexed.");
// Timeout, in seconds, used by the async indexer
public static final ConfigProperty<Integer> METADATA_INDEX_CHECK_TIMEOUT_SECONDS = ConfigProperty
    .key(METADATA_PREFIX + ".index.check.timeout.seconds")
    .defaultValue(900)
    .markAdvanced()
    .sinceVersion("0.11.0")
    .withDocumentation("After the async indexer has finished indexing upto the base instant, it will ensure that all inflight writers "
        + "reliably write index updates as well. If this timeout expires, then the indexer will abort itself safely.");
// Note: unlike the other properties, this key is prefixed with an underscore ("_hoodie.metadata...")
public static final ConfigProperty<Boolean> IGNORE_SPURIOUS_DELETES = ConfigProperty
    .key("_" + METADATA_PREFIX + ".ignore.spurious.deletes")
    .defaultValue(true)
    .markAdvanced()
    .sinceVersion("0.10.0")
    .withDocumentation("There are cases when extra files are requested to be deleted from "
        + "metadata table which are never added before. This config determines how to handle "
        + "such spurious deletes");
// Key is METADATA_PREFIX + OPTIMIZED_LOG_BLOCKS_SCAN
public static final ConfigProperty<Boolean> ENABLE_OPTIMIZED_LOG_BLOCKS_SCAN = ConfigProperty
    .key(METADATA_PREFIX + OPTIMIZED_LOG_BLOCKS_SCAN)
    .defaultValue(false)
    .markAdvanced()
    .sinceVersion("0.13.0")
    .withDocumentation("Optimized log blocks scanner that addresses all the multi-writer use-cases while appending to log files. "
        + "It also differentiates original blocks written by ingestion writers and compacted blocks written by log compaction.");
// Upper bound on metadata table deltacommits while a data table instant is pending
public static final ConfigProperty<Integer> METADATA_MAX_NUM_DELTACOMMITS_WHEN_PENDING = ConfigProperty
    .key(METADATA_PREFIX + ".max.deltacommits.when_pending")
    .defaultValue(1000)
    .markAdvanced()
    .sinceVersion("0.14.0")
    .withDocumentation("When there is a pending instant in data table, this config limits the allowed number of deltacommits in metadata table to "
        + "prevent the metadata table's timeline from growing unboundedly as compaction won't be triggered due to the pending data table instant.");
// ---- Record index ----
public static final ConfigProperty<Boolean> RECORD_INDEX_ENABLE_PROP = ConfigProperty
    .key(METADATA_PREFIX + ".record.index.enable")
    .defaultValue(false)
    .markAdvanced()
    .sinceVersion("0.14.0")
    .withDocumentation("Create the HUDI Record Index within the Metadata Table");
public static final ConfigProperty<Integer> RECORD_INDEX_MIN_FILE_GROUP_COUNT_PROP = ConfigProperty
    .key(METADATA_PREFIX + ".record.index.min.filegroup.count")
    .defaultValue(10)
    .markAdvanced()
    .sinceVersion("0.14.0")
    .withDocumentation("Minimum number of file groups to use for Record Index.");
public static final ConfigProperty<Integer> RECORD_INDEX_MAX_FILE_GROUP_COUNT_PROP = ConfigProperty
    .key(METADATA_PREFIX + ".record.index.max.filegroup.count")
    .defaultValue(10000)
    .markAdvanced()
    .sinceVersion("0.14.0")
    .withDocumentation("Maximum number of file groups to use for Record Index.");
// The default is an int expression (1 GiB), so the property is Integer-typed
public static final ConfigProperty<Integer> RECORD_INDEX_MAX_FILE_GROUP_SIZE_BYTES_PROP = ConfigProperty
    .key(METADATA_PREFIX + ".record.index.max.filegroup.size")
    .defaultValue(1024 * 1024 * 1024)
    .markAdvanced()
    .sinceVersion("0.14.0")
    .withDocumentation("Maximum size in bytes of a single file group. Large file group takes longer to compact.");
public static final ConfigProperty<Float> RECORD_INDEX_GROWTH_FACTOR_PROP = ConfigProperty
    .key(METADATA_PREFIX + ".record.index.growth.factor")
    .defaultValue(2.0f)
    .markAdvanced()
    .sinceVersion("0.14.0")
    .withDocumentation("The current number of records are multiplied by this number when estimating the number of "
        + "file groups to create automatically. This helps account for growth in the number of records in the dataset.");
// Note: the key is "<prefix>.max.init.parallelism"; it is not under ".record.index"
public static final ConfigProperty<Integer> RECORD_INDEX_MAX_PARALLELISM = ConfigProperty
    .key(METADATA_PREFIX + ".max.init.parallelism")
    .defaultValue(100000)
    .markAdvanced()
    .sinceVersion("0.14.0")
    .withDocumentation("Maximum parallelism to use when initializing Record Index.");
// ---- Reader / storage limits ----
// The trailing L makes the default a long (1 GiB)
public static final ConfigProperty<Long> MAX_READER_MEMORY_PROP = ConfigProperty
    .key(METADATA_PREFIX + ".max.reader.memory")
    .defaultValue(1024 * 1024 * 1024L)
    .markAdvanced()
    .sinceVersion("0.14.0")
    .withDocumentation("Max memory to use for the reader to read from metadata");
public static final ConfigProperty<Integer> MAX_READER_BUFFER_SIZE_PROP = ConfigProperty
    .key(METADATA_PREFIX + ".max.reader.buffer.size")
    .defaultValue(10 * 1024 * 1024)
    .markAdvanced()
    .sinceVersion("0.14.0")
    .withDocumentation("Max memory to use for the reader buffer while merging log blocks");
// No default; the infer function supplies FileSystemViewStorageConfig.SPILLABLE_DIR (or its default)
public static final ConfigProperty<String> SPILLABLE_MAP_DIR_PROP = ConfigProperty
    .key(METADATA_PREFIX + ".spillable.map.path")
    .noDefaultValue()
    .withInferFunction(cfg -> Option.of(cfg.getStringOrDefault(FileSystemViewStorageConfig.SPILLABLE_DIR)))
    .markAdvanced()
    .sinceVersion("0.14.0")
    .withDocumentation("Path on local storage to use, when keys read from metadata are held in a spillable map.");
// The int product 2 * 1024 * 1024 fits in an int; the final "* 1024L" widens to long before exceeding int range
public static final ConfigProperty<Long> MAX_LOG_FILE_SIZE_BYTES_PROP = ConfigProperty
    .key(METADATA_PREFIX + ".max.logfile.size")
    .defaultValue(2 * 1024 * 1024 * 1024L) // 2GB
    .markAdvanced()
    .sinceVersion("0.14.0")
    .withDocumentation("Maximum size in bytes of a single log file. Larger log files can contain larger log blocks "
        + "thereby reducing the number of blocks to search for keys");
public static final ConfigProperty<Boolean> AUTO_INITIALIZE = ConfigProperty
    .key(METADATA_PREFIX + ".auto.initialize")
    .defaultValue(true)
    .sinceVersion("0.14.0")
    .markAdvanced()
    .withDocumentation("Initializes the metadata table by reading from the file system when the table is first created. Enabled by default. "
        + "Warning: This should only be disabled when manually constructing the metadata table outside of typical Hudi writer flows.");
// ---- Expression index ----
public static final ConfigProperty<Boolean> EXPRESSION_INDEX_ENABLE_PROP = ConfigProperty
    .key(METADATA_PREFIX + ".index.expression.enable")
    .defaultValue(false)
    .sinceVersion("1.0.0")
    .withDocumentation("Enable expression index within the metadata table. "
        + " When this configuration property is enabled (`true`), the Hudi writer automatically "
        + " keeps all expression indexes consistent with the data table. "
        + " When disabled (`false`), all expression indexes are deleted. "
        + " Note that individual expression index can only be created through a `CREATE INDEX` "
        + " and deleted through a `DROP INDEX` statement in Spark SQL.");
public static final ConfigProperty<Integer> EXPRESSION_INDEX_FILE_GROUP_COUNT = ConfigProperty
    .key(METADATA_PREFIX + ".index.expression.file.group.count")
    .defaultValue(2)
    .markAdvanced()
    .sinceVersion("1.0.0")
    .withDocumentation("Metadata expression index partition file group count.");
public static final ConfigProperty<Integer> EXPRESSION_INDEX_PARALLELISM = ConfigProperty
    .key(METADATA_PREFIX + ".index.expression.parallelism")
    .defaultValue(200)
    .markAdvanced()
    .sinceVersion("1.0.0")
    .withDocumentation("Parallelism to use, when generating expression index.");
// ---- Partition stats index ----
public static final ConfigProperty<Boolean> ENABLE_METADATA_INDEX_PARTITION_STATS = ConfigProperty
    .key(METADATA_PREFIX + ".index.partition.stats.enable")
    .defaultValue(false)
    .sinceVersion("1.0.0")
    .withDocumentation("Enable aggregating stats for each column at the storage partition level.");
public static final ConfigProperty<Integer> METADATA_INDEX_PARTITION_STATS_FILE_GROUP_COUNT = ConfigProperty
    .key(METADATA_PREFIX + ".index.partition.stats.file.group.count")
    .defaultValue(1)
    .markAdvanced()
    .sinceVersion("1.0.0")
    .withDocumentation("Metadata partition stats file group count. This controls the size of the base and "
        + "log files and read parallelism in the partition stats index.");
public static final ConfigProperty<Integer> PARTITION_STATS_INDEX_PARALLELISM = ConfigProperty
    .key(METADATA_PREFIX + ".index.partition.stats.parallelism")
    .defaultValue(200)
    .markAdvanced()
    .sinceVersion("1.0.0")
    .withDocumentation("Parallelism to use, when generating partition stats index.");
// ---- Secondary index ----
// Enabled by default; isSecondaryIndexEnabled() additionally requires the record index to be enabled
public static final ConfigProperty<Boolean> SECONDARY_INDEX_ENABLE_PROP = ConfigProperty
    .key(METADATA_PREFIX + ".index.secondary.enable")
    .defaultValue(true)
    .sinceVersion("1.0.0")
    .withDocumentation("Enable secondary index within the metadata table. "
        + " When this configuration property is enabled (`true`), the Hudi writer automatically "
        + " keeps all secondary indexes consistent with the data table. "
        + " When disabled (`false`), all secondary indexes are deleted. "
        + " Note that individual secondary index can only be created through a `CREATE INDEX` "
        + " and deleted through a `DROP INDEX` statement in Spark SQL. ");
public static final ConfigProperty<Integer> SECONDARY_INDEX_PARALLELISM = ConfigProperty
    .key(METADATA_PREFIX + ".index.secondary.parallelism")
    .defaultValue(200)
    .markAdvanced()
    .sinceVersion("1.0.0")
    .withDocumentation("Parallelism to use, when generating secondary index.");
/**
 * Reads {@link #MAX_LOG_FILE_SIZE_BYTES_PROP}, the maximum size in bytes of a single log file.
 *
 * @return the configured maximum log file size in bytes
 */
public long getMaxLogFileSize() {
  final long maxLogFileSizeBytes = this.getLong(MAX_LOG_FILE_SIZE_BYTES_PROP);
  return maxLogFileSizeBytes;
}
private HoodieMetadataConfig() {
super();
}
/**
 * Creates a fresh {@link Builder}.
 *
 * @return a new builder instance
 */
public static HoodieMetadataConfig.Builder newBuilder() {
  final HoodieMetadataConfig.Builder builder = new HoodieMetadataConfig.Builder();
  return builder;
}
/**
 * Reads {@link #FILE_LISTING_PARALLELISM_VALUE}, clamped so the result is never below 1.
 *
 * @return the configured file listing parallelism, or 1 if the configured value is smaller
 */
public int getFileListingParallelism() {
  final int configuredParallelism = getInt(FILE_LISTING_PARALLELISM_VALUE);
  return configuredParallelism < 1 ? 1 : configuredParallelism;
}
/**
 * Reads {@link #ENABLE}.
 *
 * @return whether the internal metadata table is enabled
 */
public boolean isEnabled() {
  final boolean metadataTableEnabled = this.getBoolean(ENABLE);
  return metadataTableEnabled;
}
/**
 * Reads {@link #ENABLE_METADATA_INDEX_BLOOM_FILTER}, falling back to the property's default.
 *
 * @return whether the bloom filter index is enabled
 */
public boolean isBloomFilterIndexEnabled() {
  return this.getBooleanOrDefault(ENABLE_METADATA_INDEX_BLOOM_FILTER);
}
/**
 * Reads {@link #ENABLE_METADATA_INDEX_COLUMN_STATS}, falling back to the property's default.
 *
 * @return whether the column stats index is enabled
 */
public boolean isColumnStatsIndexEnabled() {
  return this.getBooleanOrDefault(ENABLE_METADATA_INDEX_COLUMN_STATS);
}
/**
 * Reports whether the record index is enabled. The record index counts as enabled only when
 * the metadata table itself is enabled ({@link #isEnabled()}) and
 * {@link #RECORD_INDEX_ENABLE_PROP} (or its default) is true.
 *
 * @return whether the record index is enabled
 */
public boolean isRecordIndexEnabled() {
  if (!isEnabled()) {
    // Metadata table is off, so the record index property is not consulted.
    return false;
  }
  return getBooleanOrDefault(RECORD_INDEX_ENABLE_PROP);
}
/**
 * Splits the value of {@link #COLUMN_STATS_INDEX_FOR_COLUMNS} on {@code CONFIG_VALUES_DELIMITER}.
 *
 * <p>NOTE(review): the result when the property is not set depends on how
 * {@code StringUtils.split} treats a null input — confirm before relying on it.
 *
 * @return the column names configured for the column stats index
 */
public List<String> getColumnsEnabledForColumnStatsIndex() {
  return StringUtils.split(getString(COLUMN_STATS_INDEX_FOR_COLUMNS), CONFIG_VALUES_DELIMITER);
}
/**
 * Reads {@link #COLUMN_STATS_INDEX_MAX_COLUMNS}, falling back to the property's default.
 *
 * @return the maximum number of columns to generate column stats for
 */
public Integer maxColumnsToIndexForColStats() {
  return this.getIntOrDefault(COLUMN_STATS_INDEX_MAX_COLUMNS);
}
/**
 * Reads {@link #COLUMN_STATS_INDEX_PROCESSING_MODE_OVERRIDE}; the property declares no default value.
 *
 * @return the configured processing mode override
 */
public String getColumnStatsIndexProcessingModeOverride() {
  final String processingModeOverride = getString(COLUMN_STATS_INDEX_PROCESSING_MODE_OVERRIDE);
  return processingModeOverride;
}
/**
 * Reads {@link #COLUMN_STATS_INDEX_IN_MEMORY_PROJECTION_THRESHOLD}, falling back to the property's default.
 *
 * @return the in-memory projection threshold for the column stats index
 */
public Integer getColumnStatsIndexInMemoryProjectionThreshold() {
  return this.getIntOrDefault(COLUMN_STATS_INDEX_IN_MEMORY_PROJECTION_THRESHOLD);
}
/**
 * Splits the value of {@link #BLOOM_FILTER_INDEX_FOR_COLUMNS} on {@code CONFIG_VALUES_DELIMITER}.
 *
 * <p>NOTE(review): the result when the property is not set depends on how
 * {@code StringUtils.split} treats a null input — confirm before relying on it.
 *
 * @return the column names configured for the bloom filter index
 */
public List<String> getColumnsEnabledForBloomFilterIndex() {
  return StringUtils.split(getString(BLOOM_FILTER_INDEX_FOR_COLUMNS), CONFIG_VALUES_DELIMITER);
}
/**
 * Reads {@link #METADATA_INDEX_BLOOM_FILTER_FILE_GROUP_COUNT}, falling back to the property's default.
 *
 * @return the file group count of the bloom filter index partition
 */
public int getBloomFilterIndexFileGroupCount() {
  final int fileGroupCount = getIntOrDefault(METADATA_INDEX_BLOOM_FILTER_FILE_GROUP_COUNT);
  return fileGroupCount;
}
/**
 * Reads {@link #METADATA_INDEX_COLUMN_STATS_FILE_GROUP_COUNT}, falling back to the property's default.
 *
 * @return the file group count of the column stats index partition
 */
public int getColumnStatsIndexFileGroupCount() {
  final int fileGroupCount = getIntOrDefault(METADATA_INDEX_COLUMN_STATS_FILE_GROUP_COUNT);
  return fileGroupCount;
}
/**
 * Reads {@link #BLOOM_FILTER_INDEX_PARALLELISM}, falling back to the property's default.
 *
 * @return the parallelism for generating the bloom filter index
 */
public int getBloomFilterIndexParallelism() {
  final int parallelism = getIntOrDefault(BLOOM_FILTER_INDEX_PARALLELISM);
  return parallelism;
}
/**
 * Reads {@link #COLUMN_STATS_INDEX_PARALLELISM}, falling back to the property's default.
 *
 * @return the parallelism for generating the column stats index
 */
public int getColumnStatsIndexParallelism() {
  final int parallelism = getIntOrDefault(COLUMN_STATS_INDEX_PARALLELISM);
  return parallelism;
}
/**
 * Reads {@link #METADATA_INDEX_CHECK_TIMEOUT_SECONDS}, falling back to the property's default.
 *
 * @return the indexing check timeout, in seconds
 */
public int getIndexingCheckTimeoutSeconds() {
  final int timeoutSeconds = getIntOrDefault(METADATA_INDEX_CHECK_TIMEOUT_SECONDS);
  return timeoutSeconds;
}
/**
 * Reads {@link #METRICS_ENABLE}.
 *
 * @return whether publishing of metadata table metrics is enabled
 */
public boolean isMetricsEnabled() {
  final boolean metricsEnabled = this.getBoolean(METRICS_ENABLE);
  return metricsEnabled;
}
/**
 * Reads {@link #DIR_FILTER_REGEX}.
 *
 * @return the regex for directories to filter out when initializing the metadata table
 */
public String getDirectoryFilterRegex() {
  final String directoryFilterRegex = getString(DIR_FILTER_REGEX);
  return directoryFilterRegex;
}
/**
 * Reads {@link #IGNORE_SPURIOUS_DELETES}.
 *
 * @return the configured value of the spurious-deletes flag
 */
public boolean shouldIgnoreSpuriousDeletes() {
  final boolean ignoreSpuriousDeletes = this.getBoolean(IGNORE_SPURIOUS_DELETES);
  return ignoreSpuriousDeletes;
}
/**
 * Reads {@link #ENABLE_OPTIMIZED_LOG_BLOCKS_SCAN}.
 *
 * @return whether the optimized log blocks scanner is enabled
 */
public boolean isOptimizedLogBlocksScanEnabled() {
  final boolean optimizedScanEnabled = this.getBoolean(ENABLE_OPTIMIZED_LOG_BLOCKS_SCAN);
  return optimizedScanEnabled;
}
/**
 * Reads {@link #METADATA_MAX_NUM_DELTACOMMITS_WHEN_PENDING}, falling back to the property's default.
 *
 * @return the allowed number of metadata table deltacommits while a data table instant is pending
 */
public int getMaxNumDeltacommitsWhenPending() {
  final int maxDeltaCommits = getIntOrDefault(METADATA_MAX_NUM_DELTACOMMITS_WHEN_PENDING);
  return maxDeltaCommits;
}
/**
 * Reads {@link #RECORD_INDEX_MIN_FILE_GROUP_COUNT_PROP}.
 *
 * @return the minimum number of file groups to use for the record index
 */
public int getRecordIndexMinFileGroupCount() {
  final int minFileGroupCount = this.getInt(RECORD_INDEX_MIN_FILE_GROUP_COUNT_PROP);
  return minFileGroupCount;
}
/**
 * Reads {@link #RECORD_INDEX_MAX_FILE_GROUP_COUNT_PROP}.
 *
 * @return the maximum number of file groups to use for the record index
 */
public int getRecordIndexMaxFileGroupCount() {
  final int maxFileGroupCount = this.getInt(RECORD_INDEX_MAX_FILE_GROUP_COUNT_PROP);
  return maxFileGroupCount;
}
/**
 * Reads {@link #RECORD_INDEX_GROWTH_FACTOR_PROP}.
 *
 * @return the growth factor applied to the record count when estimating file groups
 */
public float getRecordIndexGrowthFactor() {
  final float growthFactor = this.getFloat(RECORD_INDEX_GROWTH_FACTOR_PROP);
  return growthFactor;
}
/**
 * Reads {@link #RECORD_INDEX_MAX_FILE_GROUP_SIZE_BYTES_PROP} as an {@code int}.
 *
 * @return the maximum size in bytes of a single record index file group
 */
public int getRecordIndexMaxFileGroupSizeBytes() {
  final int maxFileGroupSizeBytes = this.getInt(RECORD_INDEX_MAX_FILE_GROUP_SIZE_BYTES_PROP);
  return maxFileGroupSizeBytes;
}
/**
 * Reads {@link #SPILLABLE_MAP_DIR_PROP}, the local-storage path used when keys read from
 * metadata are held in a spillable map.
 *
 * @return the configured spillable map directory
 */
public String getSpillableMapDir() {
  return getString(SPILLABLE_MAP_DIR_PROP);
}
/**
 * Misspelled alias of {@link #getSpillableMapDir()}, retained so existing callers keep working.
 * New code should call {@link #getSpillableMapDir()}.
 *
 * @return the configured spillable map directory
 */
public String getSplliableMapDir() {
  return getSpillableMapDir();
}
/**
 * Reads {@link #MAX_READER_MEMORY_PROP}.
 *
 * @return the max memory for the reader to read from metadata
 */
public long getMaxReaderMemory() {
  final long maxReaderMemory = this.getLong(MAX_READER_MEMORY_PROP);
  return maxReaderMemory;
}
/**
 * Reads {@link #MAX_READER_BUFFER_SIZE_PROP} as an {@code int}.
 *
 * @return the max memory for the reader buffer while merging log blocks
 */
public int getMaxReaderBufferSize() {
  final int maxReaderBufferSize = this.getInt(MAX_READER_BUFFER_SIZE_PROP);
  return maxReaderBufferSize;
}
/**
 * Reads {@link #RECORD_INDEX_MAX_PARALLELISM}.
 *
 * @return the maximum parallelism to use when initializing the record index
 */
public int getRecordIndexMaxParallelism() {
  final int maxParallelism = this.getInt(RECORD_INDEX_MAX_PARALLELISM);
  return maxParallelism;
}
/**
 * Reads {@link #AUTO_INITIALIZE}.
 *
 * @return whether the metadata table is initialized automatically
 */
public boolean shouldAutoInitialize() {
  final boolean autoInitialize = this.getBoolean(AUTO_INITIALIZE);
  return autoInitialize;
}
/**
 * Reads {@link #EXPRESSION_INDEX_ENABLE_PROP}, falling back to the property's default.
 *
 * @return whether the expression index is enabled
 */
public boolean isExpressionIndexEnabled() {
  return this.getBooleanOrDefault(EXPRESSION_INDEX_ENABLE_PROP);
}
/**
 * Reads {@link #EXPRESSION_INDEX_FILE_GROUP_COUNT}.
 *
 * @return the file group count of the expression index partition
 */
public int getExpressionIndexFileGroupCount() {
  final int fileGroupCount = this.getInt(EXPRESSION_INDEX_FILE_GROUP_COUNT);
  return fileGroupCount;
}
/**
 * Reads {@link #EXPRESSION_INDEX_PARALLELISM}.
 *
 * @return the parallelism for generating the expression index
 */
public int getExpressionIndexParallelism() {
  final int parallelism = this.getInt(EXPRESSION_INDEX_PARALLELISM);
  return parallelism;
}
/**
 * Reads {@link #ENABLE_METADATA_INDEX_PARTITION_STATS}, falling back to the property's default.
 *
 * @return whether the partition stats index is enabled
 */
public boolean isPartitionStatsIndexEnabled() {
  return this.getBooleanOrDefault(ENABLE_METADATA_INDEX_PARTITION_STATS);
}
/**
 * Reads {@link #METADATA_INDEX_PARTITION_STATS_FILE_GROUP_COUNT}.
 *
 * @return the file group count of the partition stats index
 */
public int getPartitionStatsIndexFileGroupCount() {
  final int fileGroupCount = this.getInt(METADATA_INDEX_PARTITION_STATS_FILE_GROUP_COUNT);
  return fileGroupCount;
}
/**
 * Reads {@link #PARTITION_STATS_INDEX_PARALLELISM}.
 *
 * @return the parallelism for generating the partition stats index
 */
public int getPartitionStatsIndexParallelism() {
  final int parallelism = this.getInt(PARTITION_STATS_INDEX_PARALLELISM);
  return parallelism;
}
/**
 * Reports whether the secondary index is enabled: requires {@link #isRecordIndexEnabled()}
 * to be true and {@link #SECONDARY_INDEX_ENABLE_PROP} to be true.
 *
 * @return whether the secondary index is enabled
 */
public boolean isSecondaryIndexEnabled() {
  // The secondary index depends on the record index (primary key index); without it the
  // secondary index property is not consulted.
  if (!isRecordIndexEnabled()) {
    return false;
  }
  return getBoolean(SECONDARY_INDEX_ENABLE_PROP);
}
/**
 * Reads {@link #SECONDARY_INDEX_PARALLELISM}.
 *
 * @return the parallelism for generating the secondary index
 */
public int getSecondaryIndexParallelism() {
  final int parallelism = this.getInt(SECONDARY_INDEX_PARALLELISM);
  return parallelism;
}
/**
 * Fluent builder for {@link HoodieMetadataConfig}.
 *
 * <p>Every setter writes straight into one underlying config instance and returns this builder.
 * {@link #build()} fills in defaults and returns that same instance, so repeated {@code build()}
 * calls on one builder return the same object.
 */
public static class Builder {
  // Engine used by build() to pick the default of ENABLE; SPARK unless withEngineType is called.
  private EngineType engineType = EngineType.SPARK;
  // The single config instance populated by the setters and handed out by build().
  private final HoodieMetadataConfig metadataConfig = new HoodieMetadataConfig();

  /**
   * Loads properties from a file into the config, using a {@link FileReader} (and therefore
   * its default charset).
   *
   * @param propertiesFile properties file to read
   * @return this builder
   * @throws IOException if the file cannot be opened, read or closed
   */
  public Builder fromFile(File propertiesFile) throws IOException {
    try (FileReader propertiesReader = new FileReader(propertiesFile)) {
      this.metadataConfig.getProps().load(propertiesReader);
    }
    return this;
  }

  /**
   * Copies all entries of {@code props} into the config.
   *
   * @param props properties to copy
   * @return this builder
   */
  public Builder fromProperties(Properties props) {
    metadataConfig.getProps().putAll(props);
    return this;
  }

  /** Sets {@link #ENABLE}. */
  public Builder enable(boolean enable) {
    this.metadataConfig.setValue(ENABLE, Boolean.toString(enable));
    return this;
  }

  /** Sets {@link #ENABLE_METADATA_INDEX_BLOOM_FILTER}. */
  public Builder withMetadataIndexBloomFilter(boolean enable) {
    this.metadataConfig.setValue(ENABLE_METADATA_INDEX_BLOOM_FILTER, Boolean.toString(enable));
    return this;
  }

  /** Sets {@link #METADATA_INDEX_BLOOM_FILTER_FILE_GROUP_COUNT}. */
  public Builder withMetadataIndexBloomFilterFileGroups(int fileGroupCount) {
    this.metadataConfig.setValue(METADATA_INDEX_BLOOM_FILTER_FILE_GROUP_COUNT, Integer.toString(fileGroupCount));
    return this;
  }

  /** Sets {@link #BLOOM_FILTER_INDEX_PARALLELISM}. */
  public Builder withBloomFilterIndexParallelism(int parallelism) {
    this.metadataConfig.setValue(BLOOM_FILTER_INDEX_PARALLELISM, Integer.toString(parallelism));
    return this;
  }

  /** Sets {@link #ENABLE_METADATA_INDEX_COLUMN_STATS}. */
  public Builder withMetadataIndexColumnStats(boolean enable) {
    this.metadataConfig.setValue(ENABLE_METADATA_INDEX_COLUMN_STATS, Boolean.toString(enable));
    return this;
  }

  /** Sets {@link #METADATA_INDEX_COLUMN_STATS_FILE_GROUP_COUNT}. */
  public Builder withMetadataIndexColumnStatsFileGroupCount(int fileGroupCount) {
    this.metadataConfig.setValue(METADATA_INDEX_COLUMN_STATS_FILE_GROUP_COUNT, Integer.toString(fileGroupCount));
    return this;
  }

  /** Sets {@link #COLUMN_STATS_INDEX_PARALLELISM}. */
  public Builder withColumnStatsIndexParallelism(int parallelism) {
    this.metadataConfig.setValue(COLUMN_STATS_INDEX_PARALLELISM, Integer.toString(parallelism));
    return this;
  }

  /** Sets {@link #COLUMN_STATS_INDEX_FOR_COLUMNS}; the string is stored as given. */
  public Builder withColumnStatsIndexForColumns(String columns) {
    this.metadataConfig.setValue(COLUMN_STATS_INDEX_FOR_COLUMNS, columns);
    return this;
  }

  /** Sets {@link #COLUMN_STATS_INDEX_MAX_COLUMNS}. */
  public Builder withMaxColumnsToIndexForColStats(int maxCols) {
    this.metadataConfig.setValue(COLUMN_STATS_INDEX_MAX_COLUMNS, Integer.toString(maxCols));
    return this;
  }

  /** Sets {@link #BLOOM_FILTER_INDEX_FOR_COLUMNS}; the string is stored as given. */
  public Builder withBloomFilterIndexForColumns(String columns) {
    this.metadataConfig.setValue(BLOOM_FILTER_INDEX_FOR_COLUMNS, columns);
    return this;
  }

  /** Sets {@link #METADATA_INDEX_CHECK_TIMEOUT_SECONDS}. */
  public Builder withIndexingCheckTimeout(int timeoutInSeconds) {
    this.metadataConfig.setValue(METADATA_INDEX_CHECK_TIMEOUT_SECONDS, Integer.toString(timeoutInSeconds));
    return this;
  }

  /** Sets {@link #METRICS_ENABLE}. */
  public Builder enableMetrics(boolean enableMetrics) {
    this.metadataConfig.setValue(METRICS_ENABLE, Boolean.toString(enableMetrics));
    return this;
  }

  /** Sets {@link #ASYNC_INDEX_ENABLE}. */
  public Builder withAsyncIndex(boolean asyncIndex) {
    this.metadataConfig.setValue(ASYNC_INDEX_ENABLE, Boolean.toString(asyncIndex));
    return this;
  }

  /** Sets {@link #COMPACT_NUM_DELTA_COMMITS}. */
  public Builder withMaxNumDeltaCommitsBeforeCompaction(int maxNumDeltaCommitsBeforeCompaction) {
    this.metadataConfig.setValue(COMPACT_NUM_DELTA_COMMITS, Integer.toString(maxNumDeltaCommitsBeforeCompaction));
    return this;
  }

  /** Sets {@link #ENABLE_LOG_COMPACTION_ON_METADATA_TABLE}. */
  public Builder withLogCompactionEnabled(boolean enableLogCompaction) {
    this.metadataConfig.setValue(ENABLE_LOG_COMPACTION_ON_METADATA_TABLE, String.valueOf(enableLogCompaction));
    return this;
  }

  /** Sets {@link #LOG_COMPACT_BLOCKS_THRESHOLD}. */
  public Builder withLogCompactBlocksThreshold(int logCompactBlocksThreshold) {
    this.metadataConfig.setValue(LOG_COMPACT_BLOCKS_THRESHOLD, String.valueOf(logCompactBlocksThreshold));
    return this;
  }

  /** Sets {@link #FILE_LISTING_PARALLELISM_VALUE}. */
  public Builder withFileListingParallelism(int parallelism) {
    this.metadataConfig.setValue(FILE_LISTING_PARALLELISM_VALUE, Integer.toString(parallelism));
    return this;
  }

  /** Sets {@link #RECORD_INDEX_MAX_PARALLELISM}. */
  public Builder withRecordIndexMaxParallelism(int parallelism) {
    this.metadataConfig.setValue(RECORD_INDEX_MAX_PARALLELISM, Integer.toString(parallelism));
    return this;
  }

  /** Sets {@link #DIR_FILTER_REGEX}; the string is stored as given. */
  public Builder withDirectoryFilterRegex(String regex) {
    this.metadataConfig.setValue(DIR_FILTER_REGEX, regex);
    return this;
  }

  /**
   * Sets {@link #IGNORE_SPURIOUS_DELETES}.
   *
   * @param validateMetadataPayloadConsistency despite its name, this value is written unchanged
   *        to {@link #IGNORE_SPURIOUS_DELETES}
   * @return this builder
   */
  public Builder ignoreSpuriousDeletes(boolean validateMetadataPayloadConsistency) {
    this.metadataConfig.setValue(IGNORE_SPURIOUS_DELETES, Boolean.toString(validateMetadataPayloadConsistency));
    return this;
  }

  /**
   * Selects the engine whose default for {@link #ENABLE} is applied by {@link #build()}.
   *
   * @param engineType engine type to use
   * @return this builder
   */
  public Builder withEngineType(EngineType engineType) {
    this.engineType = engineType;
    return this;
  }

  /**
   * Copies all entries of {@code properties} into the config; same effect as
   * {@link #fromProperties(Properties)}.
   *
   * @param properties properties to copy
   * @return this builder
   */
  public Builder withProperties(Properties properties) {
    metadataConfig.getProps().putAll(properties);
    return this;
  }

  /** Sets {@link #ENABLE_OPTIMIZED_LOG_BLOCKS_SCAN}. */
  public Builder withOptimizedLogBlocksScan(boolean enableOptimizedLogBlocksScan) {
    this.metadataConfig.setValue(ENABLE_OPTIMIZED_LOG_BLOCKS_SCAN, Boolean.toString(enableOptimizedLogBlocksScan));
    return this;
  }

  /** Sets {@link #METADATA_MAX_NUM_DELTACOMMITS_WHEN_PENDING}. */
  public Builder withMaxNumDeltacommitsWhenPending(int maxNumDeltaCommitsWhenPending) {
    this.metadataConfig.setValue(METADATA_MAX_NUM_DELTACOMMITS_WHEN_PENDING, Integer.toString(maxNumDeltaCommitsWhenPending));
    return this;
  }

  /** Sets {@link #RECORD_INDEX_ENABLE_PROP}. */
  public Builder withEnableRecordIndex(boolean enabled) {
    this.metadataConfig.setValue(RECORD_INDEX_ENABLE_PROP, Boolean.toString(enabled));
    return this;
  }

  /**
   * Sets {@link #RECORD_INDEX_MIN_FILE_GROUP_COUNT_PROP} and
   * {@link #RECORD_INDEX_MAX_FILE_GROUP_COUNT_PROP}. No ordering check is made between the two.
   *
   * @param minCount minimum file group count
   * @param maxCount maximum file group count
   * @return this builder
   */
  public Builder withRecordIndexFileGroupCount(int minCount, int maxCount) {
    final String minValue = Integer.toString(minCount);
    final String maxValue = Integer.toString(maxCount);
    this.metadataConfig.setValue(RECORD_INDEX_MIN_FILE_GROUP_COUNT_PROP, minValue);
    this.metadataConfig.setValue(RECORD_INDEX_MAX_FILE_GROUP_COUNT_PROP, maxValue);
    return this;
  }

  /** Sets {@link #RECORD_INDEX_GROWTH_FACTOR_PROP}. */
  public Builder withRecordIndexGrowthFactor(float factor) {
    this.metadataConfig.setValue(RECORD_INDEX_GROWTH_FACTOR_PROP, Float.toString(factor));
    return this;
  }

  /**
   * Sets {@link #RECORD_INDEX_MAX_FILE_GROUP_SIZE_BYTES_PROP}.
   *
   * <p>NOTE(review): the parameter is a {@code long} but the matching getter reads the property
   * with {@code getInt}; values above {@code Integer.MAX_VALUE} presumably cannot be read back —
   * confirm.
   */
  public Builder withRecordIndexMaxFileGroupSizeBytes(long sizeInBytes) {
    this.metadataConfig.setValue(RECORD_INDEX_MAX_FILE_GROUP_SIZE_BYTES_PROP, Long.toString(sizeInBytes));
    return this;
  }

  /** Sets {@link #SPILLABLE_MAP_DIR_PROP}; the string is stored as given. */
  public Builder withSpillableMapDir(String dir) {
    this.metadataConfig.setValue(SPILLABLE_MAP_DIR_PROP, dir);
    return this;
  }

  /** Sets {@link #MAX_READER_MEMORY_PROP}. */
  public Builder withMaxReaderMemory(long mem) {
    this.metadataConfig.setValue(MAX_READER_MEMORY_PROP, Long.toString(mem));
    return this;
  }

  /**
   * Sets {@link #MAX_READER_BUFFER_SIZE_PROP}.
   *
   * <p>NOTE(review): the parameter is a {@code long} but the matching getter reads the property
   * with {@code getInt} — confirm the intended range.
   */
  public Builder withMaxReaderBufferSize(long mem) {
    this.metadataConfig.setValue(MAX_READER_BUFFER_SIZE_PROP, Long.toString(mem));
    return this;
  }

  /** Sets {@link #MAX_LOG_FILE_SIZE_BYTES_PROP}. */
  public Builder withMaxLogFileSizeBytes(long sizeInBytes) {
    this.metadataConfig.setValue(MAX_LOG_FILE_SIZE_BYTES_PROP, Long.toString(sizeInBytes));
    return this;
  }

  /** Sets {@link #EXPRESSION_INDEX_FILE_GROUP_COUNT}. */
  public Builder withExpressionIndexFileGroupCount(int fileGroupCount) {
    this.metadataConfig.setValue(EXPRESSION_INDEX_FILE_GROUP_COUNT, Integer.toString(fileGroupCount));
    return this;
  }

  /** Sets {@link #EXPRESSION_INDEX_PARALLELISM}. */
  public Builder withExpressionIndexParallelism(int parallelism) {
    this.metadataConfig.setValue(EXPRESSION_INDEX_PARALLELISM, Integer.toString(parallelism));
    return this;
  }

  /** Sets {@link #ENABLE_METADATA_INDEX_PARTITION_STATS}. */
  public Builder withMetadataIndexPartitionStats(boolean enable) {
    this.metadataConfig.setValue(ENABLE_METADATA_INDEX_PARTITION_STATS, Boolean.toString(enable));
    return this;
  }

  /** Sets {@link #METADATA_INDEX_PARTITION_STATS_FILE_GROUP_COUNT}. */
  public Builder withMetadataIndexPartitionStatsFileGroupCount(int fileGroupCount) {
    this.metadataConfig.setValue(METADATA_INDEX_PARTITION_STATS_FILE_GROUP_COUNT, Integer.toString(fileGroupCount));
    return this;
  }

  /** Sets {@link #PARTITION_STATS_INDEX_PARALLELISM}. */
  public Builder withPartitionStatsIndexParallelism(int parallelism) {
    this.metadataConfig.setValue(PARTITION_STATS_INDEX_PARALLELISM, Integer.toString(parallelism));
    return this;
  }

  /**
   * Finalizes the config: first applies the engine-specific default for {@link #ENABLE}, then
   * the defaults for this config class.
   *
   * @return the builder's underlying config instance
   */
  public HoodieMetadataConfig build() {
    final boolean enableByDefault = getDefaultMetadataEnable(engineType);
    metadataConfig.setDefaultValue(ENABLE, enableByDefault);
    final String configClassName = HoodieMetadataConfig.class.getName();
    metadataConfig.setDefaults(configClassName);
    return metadataConfig;
  }

  /**
   * Picks the default of {@link #ENABLE} for an engine: JAVA defaults to {@code false}, while
   * SPARK and FLINK use the property's own default.
   *
   * @param engineType engine to look up
   * @return the default enablement for that engine
   * @throws HoodieNotSupportedException for any other engine type
   */
  private boolean getDefaultMetadataEnable(EngineType engineType) {
    switch (engineType) {
      case JAVA:
        return false;
      case SPARK:
      case FLINK:
        return ENABLE.defaultValue();
      default:
        throw new HoodieNotSupportedException("Unsupported engine " + engineType);
    }
  }
}
/**
 * Legacy alias holding the key of {@link #ENABLE}.
 *
 * @deprecated Use {@link #ENABLE} and its methods.
 */
@Deprecated
public static final String METADATA_ENABLE_PROP = ENABLE.key();
/**
 * Legacy alias holding the default value of {@link #ENABLE}.
 *
 * @deprecated Use {@link #ENABLE} and its methods.
 */
@Deprecated
public static final boolean DEFAULT_METADATA_ENABLE = ENABLE.defaultValue();
/**
 * Legacy alias holding the key of {@link #METRICS_ENABLE}.
 *
 * @deprecated Use {@link #METRICS_ENABLE} and its methods.
 */
@Deprecated
public static final String METADATA_METRICS_ENABLE_PROP = METRICS_ENABLE.key();
/**
 * Legacy alias holding the default value of {@link #METRICS_ENABLE}.
 *
 * @deprecated Use {@link #METRICS_ENABLE} and its methods.
 */
@Deprecated
public static final boolean DEFAULT_METADATA_METRICS_ENABLE = METRICS_ENABLE.defaultValue();
/**
 * Legacy alias holding the key of {@link #COMPACT_NUM_DELTA_COMMITS}.
 *
 * @deprecated Use {@link #COMPACT_NUM_DELTA_COMMITS} and its methods.
 */
@Deprecated
public static final String METADATA_COMPACT_NUM_DELTA_COMMITS_PROP = COMPACT_NUM_DELTA_COMMITS.key();
/**
 * Legacy alias holding the default value of {@link #COMPACT_NUM_DELTA_COMMITS}.
 *
 * @deprecated Use {@link #COMPACT_NUM_DELTA_COMMITS} and its methods.
 */
@Deprecated
public static final int DEFAULT_METADATA_COMPACT_NUM_DELTA_COMMITS = COMPACT_NUM_DELTA_COMMITS.defaultValue();
/**
 * Key of the former fallback switch; it has no corresponding ConfigProperty in this class.
 *
 * @deprecated No longer takes any effect.
 */
@Deprecated
public static final String ENABLE_FALLBACK_PROP = METADATA_PREFIX + ".fallback.enable";
/**
 * Default value of the former fallback switch.
 *
 * @deprecated No longer takes any effect.
 */
@Deprecated
public static final String DEFAULT_ENABLE_FALLBACK = "true";
/**
 * Legacy alias holding the key of {@link #DIR_FILTER_REGEX}.
 *
 * @deprecated Use {@link #DIR_FILTER_REGEX} and its methods.
 */
@Deprecated
public static final String DIRECTORY_FILTER_REGEX = DIR_FILTER_REGEX.key();
/**
 * Legacy alias holding the default value of {@link #DIR_FILTER_REGEX}.
 *
 * @deprecated Use {@link #DIR_FILTER_REGEX} and its methods.
 */
@Deprecated
public static final String DEFAULT_DIRECTORY_FILTER_REGEX = DIR_FILTER_REGEX.defaultValue();
/**
 * Legacy alias holding the key of {@link #FILE_LISTING_PARALLELISM_VALUE}.
 *
 * @deprecated Use {@link #FILE_LISTING_PARALLELISM_VALUE} and its methods.
 */
@Deprecated
public static final String FILE_LISTING_PARALLELISM_PROP = FILE_LISTING_PARALLELISM_VALUE.key();
/**
 * Legacy alias holding the default value of {@link #FILE_LISTING_PARALLELISM_VALUE}.
 *
 * @deprecated Use {@link #FILE_LISTING_PARALLELISM_VALUE} and its methods.
 */
@Deprecated
public static final int DEFAULT_FILE_LISTING_PARALLELISM = FILE_LISTING_PARALLELISM_VALUE.defaultValue();
}
© 2015 - 2025 Weber Informatics LLC | Privacy Policy