All Downloads are FREE. Search and download functionalities are using the official Maven repository.

org.apache.hudi.config.HoodieClusteringConfig Maven / Gradle / Ivy

The newest version!
/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.hudi.config;

import org.apache.hudi.common.config.ConfigClassProperty;
import org.apache.hudi.common.config.ConfigGroups;
import org.apache.hudi.common.config.ConfigProperty;
import org.apache.hudi.common.config.EnumDescription;
import org.apache.hudi.common.config.EnumFieldDescription;
import org.apache.hudi.common.config.HoodieConfig;
import org.apache.hudi.common.config.TypedProperties;
import org.apache.hudi.common.engine.EngineType;
import org.apache.hudi.common.util.ValidationUtils;
import org.apache.hudi.exception.HoodieNotSupportedException;
import org.apache.hudi.index.HoodieIndex;
import org.apache.hudi.table.action.cluster.ClusteringPlanPartitionFilterMode;

import java.io.File;
import java.io.FileReader;
import java.io.IOException;
import java.util.Properties;

/**
 * Clustering specific configs.
 */
@ConfigClassProperty(name = "Clustering Configs",
    groupName = ConfigGroups.Names.WRITE_CLIENT,
    description = "Configurations that control the clustering table service in hudi, "
        + "which optimizes the storage layout for better query performance by sorting and sizing data files.")
public class HoodieClusteringConfig extends HoodieConfig {

  // Any strategy specific params can be saved with this prefix
  public static final String CLUSTERING_STRATEGY_PARAM_PREFIX = "hoodie.clustering.plan.strategy.";
  public static final String SPARK_SIZED_BASED_CLUSTERING_PLAN_STRATEGY =
      "org.apache.hudi.client.clustering.plan.strategy.SparkSizeBasedClusteringPlanStrategy";
  public static final String FLINK_SIZED_BASED_CLUSTERING_PLAN_STRATEGY =
      "org.apache.hudi.client.clustering.plan.strategy.FlinkSizeBasedClusteringPlanStrategy";
  public static final String FLINK_CONSISTENT_BUCKET_CLUSTERING_PLAN_STRATEGY =
      "org.apache.hudi.client.clustering.plan.strategy.FlinkConsistentBucketClusteringPlanStrategy";
  public static final String SPARK_CONSISTENT_BUCKET_CLUSTERING_PLAN_STRATEGY =
      "org.apache.hudi.client.clustering.plan.strategy.SparkConsistentBucketClusteringPlanStrategy";
  public static final String JAVA_SIZED_BASED_CLUSTERING_PLAN_STRATEGY =
      "org.apache.hudi.client.clustering.plan.strategy.JavaSizeBasedClusteringPlanStrategy";
  public static final String SPARK_SORT_AND_SIZE_EXECUTION_STRATEGY =
      "org.apache.hudi.client.clustering.run.strategy.SparkSortAndSizeExecutionStrategy";
  public static final String SPARK_CONSISTENT_BUCKET_EXECUTION_STRATEGY =
      "org.apache.hudi.client.clustering.run.strategy.SparkConsistentBucketClusteringExecutionStrategy";
  public static final String JAVA_SORT_AND_SIZE_EXECUTION_STRATEGY =
      "org.apache.hudi.client.clustering.run.strategy.JavaSortAndSizeExecutionStrategy";
  public static final String PLAN_PARTITION_FILTER_MODE =
      "hoodie.clustering.plan.partition.filter.mode";

  // Any Space-filling curves optimize(z-order/hilbert) params can be saved with this prefix
  private static final String LAYOUT_OPTIMIZE_PARAM_PREFIX = "hoodie.layout.optimize.";

  public static final ConfigProperty DAYBASED_LOOKBACK_PARTITIONS = ConfigProperty
      .key(CLUSTERING_STRATEGY_PARAM_PREFIX + "daybased.lookback.partitions")
      .defaultValue("2")
      .markAdvanced()
      .sinceVersion("0.7.0")
      .withDocumentation("Number of partitions to list to create ClusteringPlan");

  public static final ConfigProperty PARTITION_FILTER_BEGIN_PARTITION = ConfigProperty
      .key(CLUSTERING_STRATEGY_PARAM_PREFIX + "cluster.begin.partition")
      .noDefaultValue()
      .markAdvanced()
      .sinceVersion("0.11.0")
      .withDocumentation("Begin partition used to filter partition (inclusive), only effective when the filter mode '"
          + PLAN_PARTITION_FILTER_MODE + "' is " + ClusteringPlanPartitionFilterMode.SELECTED_PARTITIONS.name());

  public static final ConfigProperty PARTITION_FILTER_END_PARTITION = ConfigProperty
      .key(CLUSTERING_STRATEGY_PARAM_PREFIX + "cluster.end.partition")
      .noDefaultValue()
      .markAdvanced()
      .sinceVersion("0.11.0")
      .withDocumentation("End partition used to filter partition (inclusive), only effective when the filter mode '"
          + PLAN_PARTITION_FILTER_MODE + "' is " + ClusteringPlanPartitionFilterMode.SELECTED_PARTITIONS.name());

  public static final ConfigProperty PLAN_STRATEGY_SMALL_FILE_LIMIT = ConfigProperty
      .key(CLUSTERING_STRATEGY_PARAM_PREFIX + "small.file.limit")
      .defaultValue(String.valueOf(300 * 1024 * 1024L))
      .sinceVersion("0.7.0")
      .withDocumentation("Files smaller than the size in bytes specified here are candidates for clustering");

  public static final ConfigProperty PARTITION_REGEX_PATTERN = ConfigProperty
      .key(CLUSTERING_STRATEGY_PARAM_PREFIX + "partition.regex.pattern")
      .noDefaultValue()
      .markAdvanced()
      .sinceVersion("0.11.0")
      .withDocumentation("Filter clustering partitions that matched regex pattern");

  public static final ConfigProperty PARTITION_SELECTED = ConfigProperty
      .key(CLUSTERING_STRATEGY_PARAM_PREFIX + "partition.selected")
      .noDefaultValue()
      .markAdvanced()
      .sinceVersion("0.11.0")
      .withDocumentation("Partitions to run clustering");

  public static final ConfigProperty PLAN_STRATEGY_CLASS_NAME = ConfigProperty
      .key("hoodie.clustering.plan.strategy.class")
      .defaultValue(SPARK_SIZED_BASED_CLUSTERING_PLAN_STRATEGY)
      .markAdvanced()
      .sinceVersion("0.7.0")
      .withDocumentation("Config to provide a strategy class (subclass of ClusteringPlanStrategy) to create clustering plan "
          + "i.e select what file groups are being clustered. Default strategy, looks at the clustering small file size limit (determined by "
          + PLAN_STRATEGY_SMALL_FILE_LIMIT.key() + ") to pick the small file slices within partitions for clustering.");

  public static final ConfigProperty EXECUTION_STRATEGY_CLASS_NAME = ConfigProperty
      .key("hoodie.clustering.execution.strategy.class")
      .defaultValue(SPARK_SORT_AND_SIZE_EXECUTION_STRATEGY)
      .markAdvanced()
      .sinceVersion("0.7.0")
      .withDocumentation("Config to provide a strategy class (subclass of RunClusteringStrategy) to define how the "
          + " clustering plan is executed. By default, we sort the file groups in th plan by the specified columns, while "
          + " meeting the configured target file sizes.");

  public static final ConfigProperty INLINE_CLUSTERING = ConfigProperty
      .key("hoodie.clustering.inline")
      .defaultValue("false")
      .sinceVersion("0.7.0")
      .withDocumentation("Turn on inline clustering - clustering will be run after each write operation is complete")
      .withAlternatives("hoodie.datasource.clustering.inline.enable");

  public static final ConfigProperty INLINE_CLUSTERING_MAX_COMMITS = ConfigProperty
      .key("hoodie.clustering.inline.max.commits")
      .defaultValue("4")
      .markAdvanced()
      .sinceVersion("0.7.0")
      .withDocumentation("Config to control frequency of clustering planning");

  public static final ConfigProperty ASYNC_CLUSTERING_MAX_COMMITS = ConfigProperty
      .key("hoodie.clustering.async.max.commits")
      .defaultValue("4")
      .markAdvanced()
      .sinceVersion("0.9.0")
      .withDocumentation("Config to control frequency of async clustering");

  public static final ConfigProperty CLUSTERING_MAX_PARALLELISM = ConfigProperty
      .key("hoodie.clustering.max.parallelism")
      .defaultValue(15)
      .markAdvanced()
      .sinceVersion("0.14.0")
      .withDocumentation("Maximum number of parallelism jobs submitted in clustering operation. "
          + "If the resource is sufficient(Like Spark engine has enough idle executors), increasing this "
          + "value will let the clustering job run faster, while it will give additional pressure to the "
          + "execution engines to manage more concurrent running jobs.");

  public static final ConfigProperty CLUSTERING_GROUP_READ_PARALLELISM = ConfigProperty
      .key("hoodie.clustering.group.read.parallelism")
      .defaultValue(20)
      .markAdvanced()
      .sinceVersion("1.0.0")
      .withDocumentation("Maximum number of parallelism when Spark read records from clustering group.");

  public static final ConfigProperty PLAN_STRATEGY_SKIP_PARTITIONS_FROM_LATEST = ConfigProperty
      .key(CLUSTERING_STRATEGY_PARAM_PREFIX + "daybased.skipfromlatest.partitions")
      .defaultValue("0")
      .markAdvanced()
      .sinceVersion("0.9.0")
      .withDocumentation("Number of partitions to skip from latest when choosing partitions to create ClusteringPlan");

  public static final ConfigProperty PLAN_PARTITION_FILTER_MODE_NAME = ConfigProperty
      .key(PLAN_PARTITION_FILTER_MODE)
      .defaultValue(ClusteringPlanPartitionFilterMode.NONE)
      .markAdvanced()
      .sinceVersion("0.11.0")
      .withDocumentation(ClusteringPlanPartitionFilterMode.class);

  public static final ConfigProperty PLAN_STRATEGY_MAX_BYTES_PER_OUTPUT_FILEGROUP = ConfigProperty
      .key(CLUSTERING_STRATEGY_PARAM_PREFIX + "max.bytes.per.group")
      .defaultValue(String.valueOf(2 * 1024 * 1024 * 1024L))
      .markAdvanced()
      .sinceVersion("0.7.0")
      .withDocumentation("Each clustering operation can create multiple output file groups. Total amount of data processed by clustering operation"
          + " is defined by below two properties (CLUSTERING_MAX_BYTES_PER_GROUP * CLUSTERING_MAX_NUM_GROUPS)."
          + " Max amount of data to be included in one group");

  public static final ConfigProperty PLAN_STRATEGY_MAX_GROUPS = ConfigProperty
      .key(CLUSTERING_STRATEGY_PARAM_PREFIX + "max.num.groups")
      .defaultValue("30")
      .markAdvanced()
      .sinceVersion("0.7.0")
      .withDocumentation("Maximum number of groups to create as part of ClusteringPlan. Increasing groups will increase parallelism");

  public static final ConfigProperty PLAN_STRATEGY_TARGET_FILE_MAX_BYTES = ConfigProperty
      .key(CLUSTERING_STRATEGY_PARAM_PREFIX + "target.file.max.bytes")
      .defaultValue(String.valueOf(1024 * 1024 * 1024L))
      .sinceVersion("0.7.0")
      .withDocumentation("Each group can produce 'N' (CLUSTERING_MAX_GROUP_SIZE/CLUSTERING_TARGET_FILE_SIZE) output file groups");

  public static final ConfigProperty PLAN_STRATEGY_SINGLE_GROUP_CLUSTERING_ENABLED = ConfigProperty
      .key(CLUSTERING_STRATEGY_PARAM_PREFIX + "single.group.clustering.enabled")
      .defaultValue(true)
      .markAdvanced()
      .sinceVersion("0.14.0")
      .withDocumentation("Whether to generate clustering plan when there is only one file group involved, by default true");

  public static final ConfigProperty PLAN_STRATEGY_SORT_COLUMNS = ConfigProperty
      .key(CLUSTERING_STRATEGY_PARAM_PREFIX + "sort.columns")
      .noDefaultValue()
      .markAdvanced()
      .sinceVersion("0.7.0")
      .withDocumentation("Columns to sort the data by when clustering");

  public static final ConfigProperty UPDATES_STRATEGY = ConfigProperty
      .key("hoodie.clustering.updates.strategy")
      .defaultValue("org.apache.hudi.client.clustering.update.strategy.SparkRejectUpdateStrategy")
      .markAdvanced()
      .sinceVersion("0.7.0")
      .withDocumentation("Determines how to handle updates, deletes to file groups that are under clustering."
          + " Default strategy just rejects the update");

  public static final ConfigProperty SCHEDULE_INLINE_CLUSTERING = ConfigProperty
      .key("hoodie.clustering.schedule.inline")
      .defaultValue("false")
      .markAdvanced()
      .withDocumentation("When set to true, clustering service will be attempted for inline scheduling after each write. Users have to ensure "
          + "they have a separate job to run async clustering(execution) for the one scheduled by this writer. Users can choose to set both "
          + "`hoodie.clustering.inline` and `hoodie.clustering.schedule.inline` to false and have both scheduling and execution triggered by any async process, on which "
          + "case `hoodie.clustering.async.enabled` is expected to be set to true. But if `hoodie.clustering.inline` is set to false, and `hoodie.clustering.schedule.inline` "
          + "is set to true, regular writers will schedule clustering inline, but users are expected to trigger async job for execution. If `hoodie.clustering.inline` is set "
          + "to true, regular writers will do both scheduling and execution inline for clustering");

  public static final ConfigProperty ASYNC_CLUSTERING_ENABLE = ConfigProperty
      .key("hoodie.clustering.async.enabled")
      .defaultValue("false")
      .sinceVersion("0.7.0")
      .withDocumentation("Enable running of clustering service, asynchronously as inserts happen on the table.")
      .withAlternatives("hoodie.datasource.clustering.async.enable");

  /**
   * @deprecated this setting has no effect. Please refer to clustering configuration, as well as
   * {@link #LAYOUT_OPTIMIZE_STRATEGY} config to enable advanced record layout optimization strategies
   */
  public static final ConfigProperty LAYOUT_OPTIMIZE_ENABLE = ConfigProperty
      .key(LAYOUT_OPTIMIZE_PARAM_PREFIX + "enable")
      .defaultValue(false)
      .markAdvanced()
      .sinceVersion("0.10.0")
      .deprecatedAfter("0.11.0")
      .withDocumentation("This setting has no effect. Please refer to clustering configuration, as well as "
          + "LAYOUT_OPTIMIZE_STRATEGY config to enable advanced record layout optimization strategies");

  /**
   * Determines ordering strategy in for records layout optimization.
   * Currently, following strategies are supported
   * 
    *
  • Linear: simply orders records lexicographically
  • *
  • Z-order: orders records along Z-order spatial-curve
  • *
  • Hilbert: orders records along Hilbert's spatial-curve
  • *
* * NOTE: "z-order", "hilbert" strategies may consume considerably more compute, than "linear". * Make sure to perform small-scale local testing for your dataset before applying globally. */ public static final ConfigProperty LAYOUT_OPTIMIZE_STRATEGY = ConfigProperty .key(LAYOUT_OPTIMIZE_PARAM_PREFIX + "strategy") .defaultValue(LayoutOptimizationStrategy.LINEAR.name()) .markAdvanced() .sinceVersion("0.10.0") .withDocumentation(LayoutOptimizationStrategy.class); /** * NOTE: This setting only has effect if {@link #LAYOUT_OPTIMIZE_STRATEGY} value is set to * either "z-order" or "hilbert" (ie leveraging space-filling curves) * * Currently, two methods to order records along the curve are supported "build" and "sample": * *
    *
  • Direct: entails that spatial curve will be built in full, "filling in" all of the individual * points corresponding to each individual record
  • *
  • Sample: leverages boundary-base interleaved index method (described in more details in * Amazon DynamoDB blog [1])
  • *
* * NOTE: Boundary-based interleaved Index method has better generalization, * but is slower than direct method. * * Please refer to RFC-28 for specific elaboration on both flows. * * [1] https://aws.amazon.com/cn/blogs/database/tag/z-order/ */ public static final ConfigProperty LAYOUT_OPTIMIZE_SPATIAL_CURVE_BUILD_METHOD = ConfigProperty .key(LAYOUT_OPTIMIZE_PARAM_PREFIX + "curve.build.method") .defaultValue(SpatialCurveCompositionStrategyType.DIRECT.name()) .markAdvanced() .sinceVersion("0.10.0") .withDocumentation(SpatialCurveCompositionStrategyType.class); /** * NOTE: This setting only has effect if {@link #LAYOUT_OPTIMIZE_SPATIAL_CURVE_BUILD_METHOD} value * is set to "sample" * * Determines target sample size used by the Boundary-based Interleaved Index method. * Larger sample size entails better layout optimization outcomes, at the expense of higher memory * footprint. */ public static final ConfigProperty LAYOUT_OPTIMIZE_BUILD_CURVE_SAMPLE_SIZE = ConfigProperty .key(LAYOUT_OPTIMIZE_PARAM_PREFIX + "build.curve.sample.size") .defaultValue("200000") .markAdvanced() .sinceVersion("0.10.0") .withDocumentation("Determines target sample size used by the Boundary-based Interleaved Index method " + "of building space-filling curve. Larger sample size entails better layout optimization outcomes, " + "at the expense of higher memory footprint."); /** * @deprecated this setting has no effect */ public static final ConfigProperty LAYOUT_OPTIMIZE_DATA_SKIPPING_ENABLE = ConfigProperty .key(LAYOUT_OPTIMIZE_PARAM_PREFIX + "data.skipping.enable") .defaultValue(true) .markAdvanced() .sinceVersion("0.10.0") .deprecatedAfter("0.11.0") .withDocumentation("Enable data skipping by collecting statistics once layout optimization is complete."); public static final ConfigProperty ROLLBACK_PENDING_CLUSTERING_ON_CONFLICT = ConfigProperty .key("hoodie.clustering.rollback.pending.replacecommit.on.conflict") .defaultValue(false) .markAdvanced() .sinceVersion("0.10.0") .withDocumentation("If updates are allowed to file groups pending clustering, then set this config to rollback failed or pending clustering instants. " + "Pending clustering will be rolled back ONLY IF there is conflict between incoming upsert and filegroup to be clustered. " + "Please exercise caution while setting this config, especially when clustering is done very frequently. This could lead to race condition in " + "rare scenarios, for example, when the clustering completes after instants are fetched but before rollback completed."); /** * @deprecated Use {@link #PLAN_STRATEGY_CLASS_NAME} and its methods instead */ @Deprecated public static final String CLUSTERING_PLAN_STRATEGY_CLASS = PLAN_STRATEGY_CLASS_NAME.key(); /** * @deprecated Use {@link #PLAN_STRATEGY_CLASS_NAME} and its methods instead */ @Deprecated public static final String DEFAULT_CLUSTERING_PLAN_STRATEGY_CLASS = PLAN_STRATEGY_CLASS_NAME.defaultValue(); /** * @deprecated Use {@link #EXECUTION_STRATEGY_CLASS_NAME} and its methods instead */ @Deprecated public static final String CLUSTERING_EXECUTION_STRATEGY_CLASS = EXECUTION_STRATEGY_CLASS_NAME.key(); /** * @deprecated Use {@link #EXECUTION_STRATEGY_CLASS_NAME} and its methods instead */ @Deprecated public static final String DEFAULT_CLUSTERING_EXECUTION_STRATEGY_CLASS = EXECUTION_STRATEGY_CLASS_NAME.defaultValue(); /** * @deprecated Use {@link #INLINE_CLUSTERING} and its methods instead */ @Deprecated public static final String INLINE_CLUSTERING_PROP = INLINE_CLUSTERING.key(); /** * @deprecated Use {@link #INLINE_CLUSTERING} and its methods instead */ @Deprecated private static final String DEFAULT_INLINE_CLUSTERING = INLINE_CLUSTERING.defaultValue(); /** * @deprecated Use {@link #INLINE_CLUSTERING_MAX_COMMITS} and its methods instead */ @Deprecated public static final String INLINE_CLUSTERING_MAX_COMMIT_PROP = INLINE_CLUSTERING_MAX_COMMITS.key(); /** * @deprecated Use {@link #INLINE_CLUSTERING_MAX_COMMITS} and its methods instead */ @Deprecated private static final String DEFAULT_INLINE_CLUSTERING_NUM_COMMITS = INLINE_CLUSTERING_MAX_COMMITS.defaultValue(); /** * @deprecated Use {@link #DAYBASED_LOOKBACK_PARTITIONS} and its methods instead */ @Deprecated public static final String CLUSTERING_TARGET_PARTITIONS = DAYBASED_LOOKBACK_PARTITIONS.key(); /** * @deprecated Use {@link #DAYBASED_LOOKBACK_PARTITIONS} and its methods instead */ @Deprecated public static final String DEFAULT_CLUSTERING_TARGET_PARTITIONS = DAYBASED_LOOKBACK_PARTITIONS.defaultValue(); /** * @deprecated Use {@link #PLAN_STRATEGY_SMALL_FILE_LIMIT} and its methods instead */ @Deprecated public static final String CLUSTERING_PLAN_SMALL_FILE_LIMIT = PLAN_STRATEGY_SMALL_FILE_LIMIT.key(); /** * @deprecated Use {@link #PLAN_STRATEGY_SMALL_FILE_LIMIT} and its methods instead */ @Deprecated public static final String DEFAULT_CLUSTERING_PLAN_SMALL_FILE_LIMIT = PLAN_STRATEGY_SMALL_FILE_LIMIT.defaultValue(); /** * @deprecated Use {@link #PLAN_STRATEGY_MAX_BYTES_PER_OUTPUT_FILEGROUP} and its methods instead */ @Deprecated public static final String CLUSTERING_MAX_BYTES_PER_GROUP = PLAN_STRATEGY_MAX_BYTES_PER_OUTPUT_FILEGROUP.key(); /** * @deprecated Use {@link #PLAN_STRATEGY_MAX_BYTES_PER_OUTPUT_FILEGROUP} and its methods instead */ @Deprecated public static final String DEFAULT_CLUSTERING_MAX_GROUP_SIZE = PLAN_STRATEGY_MAX_BYTES_PER_OUTPUT_FILEGROUP.defaultValue(); /** * @deprecated Use {@link #PLAN_STRATEGY_MAX_GROUPS} and its methods instead */ @Deprecated public static final String CLUSTERING_MAX_NUM_GROUPS = PLAN_STRATEGY_MAX_GROUPS.key(); /** * @deprecated Use {@link #PLAN_STRATEGY_MAX_GROUPS} and its methods instead */ @Deprecated public static final String DEFAULT_CLUSTERING_MAX_NUM_GROUPS = PLAN_STRATEGY_MAX_GROUPS.defaultValue(); /** * @deprecated Use {@link #PLAN_STRATEGY_TARGET_FILE_MAX_BYTES} and its methods instead */ @Deprecated public static final String CLUSTERING_TARGET_FILE_MAX_BYTES = PLAN_STRATEGY_TARGET_FILE_MAX_BYTES.key(); /** * @deprecated Use {@link #PLAN_STRATEGY_TARGET_FILE_MAX_BYTES} and its methods instead */ @Deprecated public static final String DEFAULT_CLUSTERING_TARGET_FILE_MAX_BYTES = PLAN_STRATEGY_TARGET_FILE_MAX_BYTES.defaultValue(); /** * @deprecated Use {@link #PLAN_STRATEGY_SORT_COLUMNS} and its methods instead */ @Deprecated public static final String CLUSTERING_SORT_COLUMNS_PROPERTY = PLAN_STRATEGY_SORT_COLUMNS.key(); /** * @deprecated Use {@link #UPDATES_STRATEGY} and its methods instead */ @Deprecated public static final String CLUSTERING_UPDATES_STRATEGY_PROP = UPDATES_STRATEGY.key(); /** * @deprecated Use {@link #UPDATES_STRATEGY} and its methods instead */ @Deprecated public static final String DEFAULT_CLUSTERING_UPDATES_STRATEGY = UPDATES_STRATEGY.defaultValue(); /** * @deprecated Use {@link #ASYNC_CLUSTERING_ENABLE} and its methods instead */ @Deprecated public static final String ASYNC_CLUSTERING_ENABLE_OPT_KEY = ASYNC_CLUSTERING_ENABLE.key(); /** @deprecated Use {@link #ASYNC_CLUSTERING_ENABLE} and its methods instead */ @Deprecated public static final String DEFAULT_ASYNC_CLUSTERING_ENABLE_OPT_VAL = ASYNC_CLUSTERING_ENABLE.defaultValue(); // NOTE: This ctor is required for appropriate deserialization public HoodieClusteringConfig() { super(); } public boolean isAsyncClusteringEnabled() { return getBooleanOrDefault(HoodieClusteringConfig.ASYNC_CLUSTERING_ENABLE); } public boolean isInlineClusteringEnabled() { return getBooleanOrDefault(HoodieClusteringConfig.INLINE_CLUSTERING); } public static HoodieClusteringConfig from(TypedProperties props) { return HoodieClusteringConfig.newBuilder().fromProperties(props).build(); } public static Builder newBuilder() { return new Builder(); } public static class Builder { private final HoodieClusteringConfig clusteringConfig = new HoodieClusteringConfig(); private EngineType engineType = EngineType.SPARK; public Builder withEngineType(EngineType engineType) { this.engineType = engineType; return this; } public Builder fromFile(File propertiesFile) throws IOException { try (FileReader reader = new FileReader(propertiesFile)) { this.clusteringConfig.getProps().load(reader); return this; } } public Builder withClusteringPlanStrategyClass(String clusteringStrategyClass) { clusteringConfig.setValue(PLAN_STRATEGY_CLASS_NAME, clusteringStrategyClass); return this; } public Builder withSingleGroupClusteringEnabled(Boolean enabled) { clusteringConfig.setValue(PLAN_STRATEGY_SINGLE_GROUP_CLUSTERING_ENABLED, String.valueOf(enabled)); return this; } public Builder withClusteringPlanPartitionFilterMode(ClusteringPlanPartitionFilterMode mode) { clusteringConfig.setValue(PLAN_PARTITION_FILTER_MODE_NAME.key(), mode.toString()); return this; } public Builder withClusteringExecutionStrategyClass(String runClusteringStrategyClass) { clusteringConfig.setValue(EXECUTION_STRATEGY_CLASS_NAME, runClusteringStrategyClass); return this; } public Builder withClusteringTargetPartitions(int clusteringTargetPartitions) { clusteringConfig.setValue(DAYBASED_LOOKBACK_PARTITIONS, String.valueOf(clusteringTargetPartitions)); return this; } public Builder withClusteringPartitionRegexPattern(String pattern) { clusteringConfig.setValue(PARTITION_REGEX_PATTERN, pattern); return this; } public Builder withClusteringPartitionSelected(String partitionSelected) { clusteringConfig.setValue(PARTITION_SELECTED, partitionSelected); return this; } public Builder withClusteringSkipPartitionsFromLatest(int clusteringSkipPartitionsFromLatest) { clusteringConfig.setValue(PLAN_STRATEGY_SKIP_PARTITIONS_FROM_LATEST, String.valueOf(clusteringSkipPartitionsFromLatest)); return this; } public Builder withClusteringPartitionFilterBeginPartition(String begin) { clusteringConfig.setValue(PARTITION_FILTER_BEGIN_PARTITION, begin); return this; } public Builder withClusteringPartitionFilterEndPartition(String end) { clusteringConfig.setValue(PARTITION_FILTER_END_PARTITION, end); return this; } public Builder withClusteringPlanSmallFileLimit(long clusteringSmallFileLimit) { clusteringConfig.setValue(PLAN_STRATEGY_SMALL_FILE_LIMIT, String.valueOf(clusteringSmallFileLimit)); return this; } public Builder withClusteringSortColumns(String sortColumns) { clusteringConfig.setValue(PLAN_STRATEGY_SORT_COLUMNS, sortColumns); return this; } public Builder withClusteringMaxBytesInGroup(long clusteringMaxGroupSize) { clusteringConfig.setValue(PLAN_STRATEGY_MAX_BYTES_PER_OUTPUT_FILEGROUP, String.valueOf(clusteringMaxGroupSize)); return this; } public Builder withClusteringMaxNumGroups(int maxNumGroups) { clusteringConfig.setValue(PLAN_STRATEGY_MAX_GROUPS, String.valueOf(maxNumGroups)); return this; } public Builder withClusteringTargetFileMaxBytes(long targetFileSize) { clusteringConfig.setValue(PLAN_STRATEGY_TARGET_FILE_MAX_BYTES, String.valueOf(targetFileSize)); return this; } public Builder withInlineClustering(Boolean inlineClustering) { clusteringConfig.setValue(INLINE_CLUSTERING, String.valueOf(inlineClustering)); return this; } public Builder withScheduleInlineClustering(Boolean scheduleInlineClustering) { clusteringConfig.setValue(SCHEDULE_INLINE_CLUSTERING, String.valueOf(scheduleInlineClustering)); return this; } public Builder withInlineClusteringNumCommits(int numCommits) { clusteringConfig.setValue(INLINE_CLUSTERING_MAX_COMMITS, String.valueOf(numCommits)); return this; } public Builder withAsyncClusteringMaxCommits(int numCommits) { clusteringConfig.setValue(ASYNC_CLUSTERING_MAX_COMMITS, String.valueOf(numCommits)); return this; } public Builder fromProperties(Properties props) { // TODO this should cherry-pick only clustering properties this.clusteringConfig.getProps().putAll(props); return this; } public Builder withClusteringUpdatesStrategy(String updatesStrategyClass) { clusteringConfig.setValue(UPDATES_STRATEGY, updatesStrategyClass); return this; } public Builder withAsyncClustering(Boolean asyncClustering) { clusteringConfig.setValue(ASYNC_CLUSTERING_ENABLE, String.valueOf(asyncClustering)); return this; } public Builder withRollbackPendingClustering(Boolean rollbackPendingClustering) { clusteringConfig.setValue(ROLLBACK_PENDING_CLUSTERING_ON_CONFLICT, String.valueOf(rollbackPendingClustering)); return this; } public Builder withDataOptimizeStrategy(String strategy) { clusteringConfig.setValue(LAYOUT_OPTIMIZE_STRATEGY, strategy); return this; } public Builder withDataOptimizeBuildCurveStrategy(String method) { clusteringConfig.setValue(LAYOUT_OPTIMIZE_SPATIAL_CURVE_BUILD_METHOD, method); return this; } public Builder withDataOptimizeBuildCurveSampleNumber(int sampleNumber) { clusteringConfig.setValue(LAYOUT_OPTIMIZE_BUILD_CURVE_SAMPLE_SIZE, String.valueOf(sampleNumber)); return this; } public HoodieClusteringConfig build() { setDefaults(); validate(); return clusteringConfig; } private void setDefaults() { clusteringConfig.setDefaultValue(PLAN_STRATEGY_CLASS_NAME, getDefaultPlanStrategyClassName(engineType)); clusteringConfig.setDefaultValue(EXECUTION_STRATEGY_CLASS_NAME, getDefaultExecutionStrategyClassName(engineType)); clusteringConfig.setDefaults(HoodieClusteringConfig.class.getName()); } private void validate() { boolean inlineCluster = clusteringConfig.getBoolean(HoodieClusteringConfig.INLINE_CLUSTERING); boolean inlineClusterSchedule = clusteringConfig.getBoolean(HoodieClusteringConfig.SCHEDULE_INLINE_CLUSTERING); ValidationUtils.checkArgument(!(inlineCluster && inlineClusterSchedule), String.format("Either of inline clustering (%s) or " + "schedule inline clustering (%s) can be enabled. Both can't be set to true at the same time. %s,%s", HoodieClusteringConfig.INLINE_CLUSTERING.key(), HoodieClusteringConfig.SCHEDULE_INLINE_CLUSTERING.key(), inlineCluster, inlineClusterSchedule)); if (isConsistentHashingBucketIndex()) { String planStrategy = clusteringConfig.getString(PLAN_STRATEGY_CLASS_NAME); if (engineType == EngineType.FLINK) { ValidationUtils.checkArgument(planStrategy.equalsIgnoreCase(FLINK_CONSISTENT_BUCKET_CLUSTERING_PLAN_STRATEGY), "Consistent hashing bucket index only supports clustering plan strategy : " + FLINK_CONSISTENT_BUCKET_CLUSTERING_PLAN_STRATEGY); } else { ValidationUtils.checkArgument( planStrategy.equalsIgnoreCase(SPARK_CONSISTENT_BUCKET_CLUSTERING_PLAN_STRATEGY), "Consistent hashing bucket index only supports clustering plan strategy : " + SPARK_CONSISTENT_BUCKET_CLUSTERING_PLAN_STRATEGY); ValidationUtils.checkArgument( clusteringConfig.getString(EXECUTION_STRATEGY_CLASS_NAME).equals(SPARK_CONSISTENT_BUCKET_EXECUTION_STRATEGY), "Consistent hashing bucket index only supports clustering execution strategy : " + SPARK_CONSISTENT_BUCKET_EXECUTION_STRATEGY); } } } private boolean isConsistentHashingBucketIndex() { return clusteringConfig.contains(HoodieIndexConfig.INDEX_TYPE.key()) && clusteringConfig.contains(HoodieIndexConfig.BUCKET_INDEX_ENGINE_TYPE.key()) && clusteringConfig.getString(HoodieIndexConfig.INDEX_TYPE.key()).equalsIgnoreCase(HoodieIndex.IndexType.BUCKET.name()) && clusteringConfig.getString(HoodieIndexConfig.BUCKET_INDEX_ENGINE_TYPE.key()).equalsIgnoreCase(HoodieIndex.BucketIndexEngineType.CONSISTENT_HASHING.name()); } private String getDefaultPlanStrategyClassName(EngineType engineType) { switch (engineType) { case SPARK: return isConsistentHashingBucketIndex() ? SPARK_CONSISTENT_BUCKET_CLUSTERING_PLAN_STRATEGY : SPARK_SIZED_BASED_CLUSTERING_PLAN_STRATEGY; case FLINK: return isConsistentHashingBucketIndex() ? FLINK_CONSISTENT_BUCKET_CLUSTERING_PLAN_STRATEGY : FLINK_SIZED_BASED_CLUSTERING_PLAN_STRATEGY; case JAVA: return JAVA_SIZED_BASED_CLUSTERING_PLAN_STRATEGY; default: throw new HoodieNotSupportedException("Unsupported engine " + engineType); } } private String getDefaultExecutionStrategyClassName(EngineType engineType) { switch (engineType) { case SPARK: return isConsistentHashingBucketIndex() ? SPARK_CONSISTENT_BUCKET_EXECUTION_STRATEGY : SPARK_SORT_AND_SIZE_EXECUTION_STRATEGY; case FLINK: case JAVA: return JAVA_SORT_AND_SIZE_EXECUTION_STRATEGY; default: throw new HoodieNotSupportedException("Unsupported engine " + engineType); } } } /** * Type of strategy for building Z-order/Hilbert space-filling curves. */ @EnumDescription("This configuration only has effect if `hoodie.layout.optimize.strategy` is " + "set to either \"z-order\" or \"hilbert\" (i.e. leveraging space-filling curves). This " + "configuration controls the type of a strategy to use for building the space-filling " + "curves, tackling specifically how the Strings are ordered based on the curve. " + "Since we truncate the String to 8 bytes for ordering, there are two issues: (1) it " + "can lead to poor aggregation effect, (2) the truncation of String longer than 8 bytes " + "loses the precision, if the Strings are different but the 8-byte prefix is the same. " + "The boundary-based interleaved index method (\"SAMPLE\") has better generalization, " + "solving the two problems above, but is slower than direct method (\"DIRECT\"). " + "User should benchmark the write and query performance before tweaking this in " + "production, if this is actually a problem. Please refer to RFC-28 for more details.") public enum SpatialCurveCompositionStrategyType { @EnumFieldDescription("This strategy builds the spatial curve in full, filling in all of " + "the individual points corresponding to each individual record, which requires less " + "compute.") DIRECT, @EnumFieldDescription("This strategy leverages boundary-base interleaved index method " + "(described in more details in Amazon DynamoDB blog " + "https://aws.amazon.com/cn/blogs/database/tag/z-order/) and produces a better layout " + "compared to DIRECT strategy. It requires more compute and is slower.") SAMPLE } /** * Layout optimization strategies such as Z-order/Hilbert space-curves, etc */ @EnumDescription("Determines ordering strategy for records layout optimization.") public enum LayoutOptimizationStrategy { @EnumFieldDescription("Orders records lexicographically") LINEAR, @EnumFieldDescription("Orders records along Z-order spatial-curve.") ZORDER, @EnumFieldDescription("Orders records along Hilbert's spatial-curve.") HILBERT } public static LayoutOptimizationStrategy resolveLayoutOptimizationStrategy(String cfgVal) { if (cfgVal.equalsIgnoreCase("z-order")) { return LayoutOptimizationStrategy.ZORDER; } return LayoutOptimizationStrategy.valueOf(cfgVal.toUpperCase()); } }




© 2015 - 2025 Weber Informatics LLC | Privacy Policy