/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hudi
import org.apache.hudi.DataSourceReadOptions.{QUERY_TYPE, QUERY_TYPE_READ_OPTIMIZED_OPT_VAL, QUERY_TYPE_SNAPSHOT_OPT_VAL}
import org.apache.hudi.HoodieConversionUtils.toScalaOption
import org.apache.hudi.common.config._
import org.apache.hudi.common.fs.ConsistencyGuardConfig
import org.apache.hudi.common.model.{HoodieTableType, WriteOperationType}
import org.apache.hudi.common.table.HoodieTableConfig
import org.apache.hudi.common.table.timeline.TimelineUtils.HollowCommitHandling
import org.apache.hudi.common.util.ConfigUtils.{DELTA_STREAMER_CONFIG_PREFIX, IS_QUERY_AS_RO_TABLE, STREAMER_CONFIG_PREFIX}
import org.apache.hudi.common.util.Option
import org.apache.hudi.common.util.ValidationUtils.checkState
import org.apache.hudi.config.{HoodieClusteringConfig, HoodieWriteConfig}
import org.apache.hudi.hive.{HiveSyncConfig, HiveSyncConfigHolder, HiveSyncTool}
import org.apache.hudi.keygen.KeyGenUtils.inferKeyGeneratorType
import org.apache.hudi.keygen.constant.KeyGeneratorOptions
import org.apache.hudi.keygen.factory.HoodieSparkKeyGeneratorFactory.{getKeyGeneratorClassNameFromType, inferKeyGeneratorTypeFromWriteConfig}
import org.apache.hudi.keygen.{CustomKeyGenerator, NonpartitionedKeyGenerator, SimpleKeyGenerator}
import org.apache.hudi.sync.common.HoodieSyncConfig
import org.apache.hudi.util.JFunction
import org.apache.spark.sql.execution.datasources.{DataSourceUtils => SparkDataSourceUtils}
import org.slf4j.LoggerFactory
import scala.collection.JavaConverters._
import scala.language.implicitConversions
/**
* List of options that can be passed to the Hoodie datasource,
* in addition to the hoodie client configs
*/
/**
* Options supported for reading hoodie tables.
*/
object DataSourceReadOptions {
val QUERY_TYPE_SNAPSHOT_OPT_VAL = "snapshot"
val QUERY_TYPE_READ_OPTIMIZED_OPT_VAL = "read_optimized"
val QUERY_TYPE_INCREMENTAL_OPT_VAL = "incremental"
val QUERY_TYPE: ConfigProperty[String] = ConfigProperty
.key("hoodie.datasource.query.type")
.defaultValue(QUERY_TYPE_SNAPSHOT_OPT_VAL)
.withAlternatives("hoodie.datasource.view.type")
.withValidValues(QUERY_TYPE_SNAPSHOT_OPT_VAL, QUERY_TYPE_READ_OPTIMIZED_OPT_VAL, QUERY_TYPE_INCREMENTAL_OPT_VAL)
.withDocumentation("Whether data needs to be read, in `" + QUERY_TYPE_INCREMENTAL_OPT_VAL + "` mode (new data since an instantTime) " +
"(or) `" + QUERY_TYPE_READ_OPTIMIZED_OPT_VAL + "` mode (obtain latest view, based on base files) (or) `" + QUERY_TYPE_SNAPSHOT_OPT_VAL + "` mode " +
"(obtain latest view, by merging base and (if any) log files)")
val INCREMENTAL_FORMAT_LATEST_STATE_VAL = "latest_state"
val INCREMENTAL_FORMAT_CDC_VAL = "cdc"
val INCREMENTAL_FORMAT: ConfigProperty[String] = ConfigProperty
.key("hoodie.datasource.query.incremental.format")
.defaultValue(INCREMENTAL_FORMAT_LATEST_STATE_VAL)
.withValidValues(INCREMENTAL_FORMAT_LATEST_STATE_VAL, INCREMENTAL_FORMAT_CDC_VAL)
.markAdvanced()
.sinceVersion("0.13.0")
.withDocumentation("This config is used alone with the 'incremental' query type." +
"When set to 'latest_state', it returns the latest records' values." +
"When set to 'cdc', it returns the cdc data.")
val REALTIME_SKIP_MERGE_OPT_VAL = "skip_merge"
val REALTIME_PAYLOAD_COMBINE_OPT_VAL = "payload_combine"
val REALTIME_MERGE: ConfigProperty[String] = ConfigProperty
.key("hoodie.datasource.merge.type")
.defaultValue(REALTIME_PAYLOAD_COMBINE_OPT_VAL)
.withValidValues(REALTIME_SKIP_MERGE_OPT_VAL, REALTIME_PAYLOAD_COMBINE_OPT_VAL)
.markAdvanced()
.withDocumentation("For Snapshot query on merge on read table, control whether we invoke the record " +
s"payload implementation to merge (${REALTIME_PAYLOAD_COMBINE_OPT_VAL}) or skip merging altogether" +
s"${REALTIME_SKIP_MERGE_OPT_VAL}")
val READ_PATHS: ConfigProperty[String] = ConfigProperty
.key("hoodie.datasource.read.paths")
.noDefaultValue()
.markAdvanced()
.withDocumentation("Comma separated list of file paths to read within a Hudi table.")
val READ_PRE_COMBINE_FIELD = HoodieWriteConfig.PRECOMBINE_FIELD_NAME
val ENABLE_HOODIE_FILE_INDEX: ConfigProperty[Boolean] = ConfigProperty
.key("hoodie.file.index.enable")
.defaultValue(true)
.markAdvanced()
.deprecatedAfter("0.11.0")
.withDocumentation("Enables use of the spark file index implementation for Hudi, "
+ "that speeds up listing of large tables.")
val START_OFFSET: ConfigProperty[String] = ConfigProperty
.key("hoodie.datasource.streaming.startOffset")
.defaultValue("earliest")
.markAdvanced()
.sinceVersion("0.13.0")
.withDocumentation("Start offset to pull data from hoodie streaming source. allow earliest, latest, and " +
"specified start instant time")
val BEGIN_INSTANTTIME: ConfigProperty[String] = ConfigProperty
.key("hoodie.datasource.read.begin.instanttime")
.noDefaultValue()
.withDocumentation("Required when `" + QUERY_TYPE.key() + "` is set to `" + QUERY_TYPE_INCREMENTAL_OPT_VAL + "`. Represents the instant time to start incrementally pulling data from. The instanttime here need not necessarily "
+ "correspond to an instant on the timeline. New data written with an instant_time > BEGIN_INSTANTTIME are fetched out. "
+ "For e.g: ‘20170901080000’ will get all new data written after Sep 1, 2017 08:00AM. Note that if `"
+ HoodieCommonConfig.INCREMENTAL_READ_HANDLE_HOLLOW_COMMIT.key() + "` set to "
+ HollowCommitHandling.USE_TRANSITION_TIME + ", will use instant's "
+ "`stateTransitionTime` to perform comparison.")
val END_INSTANTTIME: ConfigProperty[String] = ConfigProperty
.key("hoodie.datasource.read.end.instanttime")
.noDefaultValue()
.withDocumentation("Used when `" + QUERY_TYPE.key() + "` is set to `" + QUERY_TYPE_INCREMENTAL_OPT_VAL +
"`. Represents the instant time to limit incrementally fetched data to. When not specified latest commit time from " +
"timeline is assumed by default. When specified, new data written with an instant_time <= END_INSTANTTIME are fetched out. " +
"Point in time type queries make more sense with begin and end instant times specified. Note that if `"
+ HoodieCommonConfig.INCREMENTAL_READ_HANDLE_HOLLOW_COMMIT.key() + "` set to `"
+ HollowCommitHandling.USE_TRANSITION_TIME + "`, will use instant's "
+ "`stateTransitionTime` to perform comparison.")
val INCREMENTAL_READ_SCHEMA_USE_END_INSTANTTIME: ConfigProperty[String] = ConfigProperty
.key("hoodie.datasource.read.schema.use.end.instanttime")
.defaultValue("false")
.markAdvanced()
.withDocumentation("Uses end instant schema when incrementally fetched data to. Default: users latest instant schema.")
val PUSH_DOWN_INCR_FILTERS: ConfigProperty[String] = ConfigProperty
.key("hoodie.datasource.read.incr.filters")
.defaultValue("")
.markAdvanced()
.withDocumentation("For use-cases like DeltaStreamer which reads from Hoodie Incremental table and applies "
+ "opaque map functions, filters appearing late in the sequence of transformations cannot be automatically "
+ "pushed down. This option allows setting filters directly on Hoodie Source.")
val INCR_PATH_GLOB: ConfigProperty[String] = ConfigProperty
.key("hoodie.datasource.read.incr.path.glob")
.defaultValue("")
.markAdvanced()
.withDocumentation("For the use-cases like users only want to incremental pull from certain partitions "
+ "instead of the full table. This option allows using glob pattern to directly filter on path.")
val TIME_TRAVEL_AS_OF_INSTANT: ConfigProperty[String] = HoodieCommonConfig.TIMESTAMP_AS_OF
val ENABLE_DATA_SKIPPING: ConfigProperty[Boolean] = ConfigProperty
.key("hoodie.enable.data.skipping")
.defaultValue(false)
.markAdvanced()
.sinceVersion("0.10.0")
.withDocumentation("Enables data-skipping allowing queries to leverage indexes to reduce the search space by " +
"skipping over files")
val EXTRACT_PARTITION_VALUES_FROM_PARTITION_PATH: ConfigProperty[Boolean] =
ConfigProperty.key("hoodie.datasource.read.extract.partition.values.from.path")
.defaultValue(false)
.markAdvanced()
.sinceVersion("0.11.0")
.withDocumentation("When set to true, values for partition columns (partition values) will be extracted" +
" from physical partition path (default Spark behavior). When set to false partition values will be" +
" read from the data file (in Hudi partition columns are persisted by default)." +
" This config is a fallback allowing to preserve existing behavior, and should not be used otherwise.")
val FILE_INDEX_LISTING_MODE_EAGER = "eager"
val FILE_INDEX_LISTING_MODE_LAZY = "lazy"
val FILE_INDEX_LISTING_MODE_OVERRIDE: ConfigProperty[String] =
ConfigProperty.key("hoodie.datasource.read.file.index.listing.mode")
.defaultValue(FILE_INDEX_LISTING_MODE_LAZY)
.withValidValues(FILE_INDEX_LISTING_MODE_LAZY, FILE_INDEX_LISTING_MODE_EAGER)
.markAdvanced()
.sinceVersion("0.13.0")
.withDocumentation("Overrides Hudi's file-index implementation's file listing mode: when set to 'eager'," +
" file-index will list all partition paths and corresponding file slices w/in them eagerly, during initialization," +
" prior to partition-pruning kicking in, meaning that all partitions will be listed including ones that might be " +
" subsequently pruned out; when set to 'lazy', partitions and file-slices w/in them will be listed" +
" lazily (ie when they actually accessed, instead of when file-index is initialized) allowing partition pruning" +
" to occur before that, only listing partitions that has already been pruned. Please note that, this config" +
" is provided purely to allow to fallback to behavior existing prior to 0.13.0 release, and will be deprecated" +
" soon after.")
val FILE_INDEX_LISTING_PARTITION_PATH_PREFIX_ANALYSIS_ENABLED: ConfigProperty[Boolean] =
ConfigProperty.key("hoodie.datasource.read.file.index.listing.partition-path-prefix.analysis.enabled")
.defaultValue(true)
.markAdvanced()
.sinceVersion("0.13.0")
.withDocumentation("Controls whether partition-path prefix analysis is enabled w/in the file-index, allowing" +
" to avoid necessity to recursively list deep folder structures of partitioned tables w/ multiple partition columns," +
" by carefully analyzing provided partition-column predicates and deducing corresponding partition-path prefix from " +
" them (if possible).")
val INCREMENTAL_FALLBACK_TO_FULL_TABLE_SCAN: ConfigProperty[String] = ConfigProperty
.key("hoodie.datasource.read.incr.fallback.fulltablescan.enable")
.defaultValue("false")
.markAdvanced()
.withDocumentation("When doing an incremental query whether we should fall back to full table scans if file does not exist.")
val SCHEMA_EVOLUTION_ENABLED: ConfigProperty[java.lang.Boolean] = HoodieCommonConfig.SCHEMA_EVOLUTION_ENABLE
val INCREMENTAL_READ_HANDLE_HOLLOW_COMMIT: ConfigProperty[String] = HoodieCommonConfig.INCREMENTAL_READ_HANDLE_HOLLOW_COMMIT
val CREATE_TIMELINE_RELATION: ConfigProperty[String] =
ConfigProperty.key("hoodie.datasource.read.table.valued.function.timeline.relation")
.defaultValue("false")
.markAdvanced()
.sinceVersion("1.0.0")
.withDocumentation("When this is set, the relation created by DefaultSource is for a view representing" +
" the result set of the table valued function hudi_query_timeline(...)")
val TIMELINE_RELATION_ARG_ARCHIVED_TIMELINE: ConfigProperty[String] =
ConfigProperty.key("hoodie.datasource.read.table.valued.function.timeline.relation.archived")
.defaultValue("false")
.markAdvanced()
.sinceVersion("1.0.0")
.withDocumentation("When this is set, the result set of the table valued function hudi_query_timeline(...)" +
" will include archived timeline")
val CREATE_FILESYSTEM_RELATION: ConfigProperty[String] = ConfigProperty
.key("hoodie.datasource.read.create.filesystem.relation")
.defaultValue("false")
.markAdvanced()
.sinceVersion("1.0.0")
.withDocumentation("When this is set, the relation created by DefaultSource is for a view representing" +
" the result set of the table valued function hudi_filesystem_view(...)")
val FILESYSTEM_RELATION_ARG_SUBPATH: ConfigProperty[String] =
ConfigProperty.key("hoodie.datasource.read.table.valued.function.filesystem.relation.subpath")
.defaultValue("")
.markAdvanced()
.sinceVersion("1.0.0")
.withDocumentation("A regex under the table's base path to get file system view information")
/** @deprecated Use {@link QUERY_TYPE} and its methods instead */
@Deprecated
val QUERY_TYPE_OPT_KEY = QUERY_TYPE.key()
/** @deprecated Use {@link QUERY_TYPE} and its methods instead */
@Deprecated
val DEFAULT_QUERY_TYPE_OPT_VAL: String = QUERY_TYPE_SNAPSHOT_OPT_VAL
/** @deprecated Use {@link REALTIME_MERGE} and its methods instead */
@Deprecated
val REALTIME_MERGE_OPT_KEY = REALTIME_MERGE.key()
/** @deprecated Use {@link REALTIME_MERGE} and its methods instead */
@Deprecated
val DEFAULT_REALTIME_MERGE_OPT_VAL = REALTIME_PAYLOAD_COMBINE_OPT_VAL
/** @deprecated Use {@link READ_PATHS} and its methods instead */
@Deprecated
val READ_PATHS_OPT_KEY = READ_PATHS.key()
/** @deprecated Use {@link QUERY_TYPE} and its methods instead */
@Deprecated
val VIEW_TYPE_OPT_KEY = "hoodie.datasource.view.type"
@Deprecated
val VIEW_TYPE_READ_OPTIMIZED_OPT_VAL = "read_optimized"
@Deprecated
val VIEW_TYPE_INCREMENTAL_OPT_VAL = "incremental"
@Deprecated
val VIEW_TYPE_REALTIME_OPT_VAL = "realtime"
@Deprecated
val DEFAULT_VIEW_TYPE_OPT_VAL = VIEW_TYPE_READ_OPTIMIZED_OPT_VAL
/** @deprecated Use {@link BEGIN_INSTANTTIME} and its methods instead */
@Deprecated
val BEGIN_INSTANTTIME_OPT_KEY = BEGIN_INSTANTTIME.key()
/** @deprecated Use {@link END_INSTANTTIME} and its methods instead */
@Deprecated
val END_INSTANTTIME_OPT_KEY = END_INSTANTTIME.key()
/** @deprecated Use {@link INCREMENTAL_READ_SCHEMA_USE_END_INSTANTTIME} and its methods instead */
@Deprecated
val INCREMENTAL_READ_SCHEMA_USE_END_INSTANTTIME_OPT_KEY = INCREMENTAL_READ_SCHEMA_USE_END_INSTANTTIME.key()
/** @deprecated Use {@link INCREMENTAL_READ_SCHEMA_USE_END_INSTANTTIME} and its methods instead */
@Deprecated
val DEFAULT_INCREMENTAL_READ_SCHEMA_USE_END_INSTANTTIME_OPT_VAL = INCREMENTAL_READ_SCHEMA_USE_END_INSTANTTIME.defaultValue()
/** @deprecated Use {@link PUSH_DOWN_INCR_FILTERS} and its methods instead */
@Deprecated
val PUSH_DOWN_INCR_FILTERS_OPT_KEY = PUSH_DOWN_INCR_FILTERS.key()
/** @deprecated Use {@link PUSH_DOWN_INCR_FILTERS} and its methods instead */
@Deprecated
val DEFAULT_PUSH_DOWN_FILTERS_OPT_VAL = PUSH_DOWN_INCR_FILTERS.defaultValue()
/** @deprecated Use {@link INCR_PATH_GLOB} and its methods instead */
@Deprecated
val INCR_PATH_GLOB_OPT_KEY = INCR_PATH_GLOB.key()
/** @deprecated Use {@link INCR_PATH_GLOB} and its methods instead */
@Deprecated
val DEFAULT_INCR_PATH_GLOB_OPT_VAL = INCR_PATH_GLOB.defaultValue()
}
/**
* Options supported for writing hoodie tables.
*/
object DataSourceWriteOptions {
val BULK_INSERT_OPERATION_OPT_VAL = WriteOperationType.BULK_INSERT.value
val INSERT_OPERATION_OPT_VAL = WriteOperationType.INSERT.value
val UPSERT_OPERATION_OPT_VAL = WriteOperationType.UPSERT.value
val DELETE_OPERATION_OPT_VAL = WriteOperationType.DELETE.value
val DELETE_PARTITION_OPERATION_OPT_VAL = WriteOperationType.DELETE_PARTITION.value
val BOOTSTRAP_OPERATION_OPT_VAL = WriteOperationType.BOOTSTRAP.value
val INSERT_OVERWRITE_OPERATION_OPT_VAL = WriteOperationType.INSERT_OVERWRITE.value
val INSERT_OVERWRITE_TABLE_OPERATION_OPT_VAL = WriteOperationType.INSERT_OVERWRITE_TABLE.value
val OPERATION: ConfigProperty[String] = ConfigProperty
.key("hoodie.datasource.write.operation")
.defaultValue(UPSERT_OPERATION_OPT_VAL)
.withValidValues(
WriteOperationType.INSERT.value,
WriteOperationType.INSERT_PREPPED.value,
WriteOperationType.UPSERT.value,
WriteOperationType.UPSERT_PREPPED.value,
WriteOperationType.BULK_INSERT.value,
WriteOperationType.BULK_INSERT_PREPPED.value,
WriteOperationType.DELETE.value,
WriteOperationType.DELETE_PREPPED.value,
WriteOperationType.BOOTSTRAP.value,
WriteOperationType.INSERT_OVERWRITE.value,
WriteOperationType.CLUSTER.value,
WriteOperationType.DELETE_PARTITION.value,
WriteOperationType.INSERT_OVERWRITE_TABLE.value,
WriteOperationType.COMPACT.value,
WriteOperationType.ALTER_SCHEMA.value
)
.withDocumentation("Whether to do upsert, insert or bulk_insert for the write operation. " +
"Use bulk_insert to load new data into a table, and there on use upsert/insert. " +
"bulk insert uses a disk based write path to scale to load large inputs without need to cache it.")
val COW_TABLE_TYPE_OPT_VAL = HoodieTableType.COPY_ON_WRITE.name
val MOR_TABLE_TYPE_OPT_VAL = HoodieTableType.MERGE_ON_READ.name
val TABLE_TYPE: ConfigProperty[String] = ConfigProperty
.key("hoodie.datasource.write.table.type")
.defaultValue(COW_TABLE_TYPE_OPT_VAL)
.withValidValues(COW_TABLE_TYPE_OPT_VAL, MOR_TABLE_TYPE_OPT_VAL)
.withAlternatives("hoodie.datasource.write.storage.type")
.withDocumentation("The table type for the underlying data, for this write. This can’t change between writes.")
/**
* Config key with boolean value that indicates whether record being written during UPDATE or DELETE Spark SQL
* operations are already prepped.
*/
val SPARK_SQL_WRITES_PREPPED_KEY = "_hoodie.spark.sql.writes.prepped";
/**
* Maybe derive the partition path from the incoming df, if not explicitly set.
*
* @param optParams Parameters to be translated
* @return Parameters after translation
*/
def mayBeDerivePartitionPath(optParams: Map[String, String]): Map[String, String] = {
var translatedOptParams = optParams
// translate the partitionBy API of spark's DataFrameWriter to PARTITIONPATH_FIELD
// we should set hoodie's partition path only if it's not set by the user.
if (optParams.contains(SparkDataSourceUtils.PARTITIONING_COLUMNS_KEY)
&& !optParams.contains(KeyGeneratorOptions.PARTITIONPATH_FIELD_NAME.key())) {
val partitionColumns = optParams.get(SparkDataSourceUtils.PARTITIONING_COLUMNS_KEY)
.map(SparkDataSourceUtils.decodePartitioningColumns)
.getOrElse(Nil)
val keyGeneratorClass = optParams.getOrElse(DataSourceWriteOptions.KEYGENERATOR_CLASS_NAME.key(),
DataSourceWriteOptions.KEYGENERATOR_CLASS_NAME.defaultValue)
keyGeneratorClass match {
// CustomKeyGenerator needs special treatment, because it needs to be specified in a way
// such as "field1:PartitionKeyType1,field2:PartitionKeyType2".
// partitionBy can specify the partition like this: partitionBy("p1", "p2:SIMPLE", "p3:TIMESTAMP")
case c if (c.nonEmpty && c == classOf[CustomKeyGenerator].getName) =>
val partitionPathField = partitionColumns.map(e => {
if (e.contains(":")) {
e
} else {
s"$e:SIMPLE"
}
}).mkString(",")
translatedOptParams = optParams ++ Map(PARTITIONPATH_FIELD.key -> partitionPathField)
case c if (c.isEmpty || !keyGeneratorClass.equals(classOf[NonpartitionedKeyGenerator].getName)) =>
// for any key gen other than NonPartitioned key gen, we can override the partition field config.
val partitionPathField = partitionColumns.mkString(",")
translatedOptParams = optParams ++ Map(PARTITIONPATH_FIELD.key -> partitionPathField)
case _ => // no-op in case of NonpartitionedKeyGenerator.
}
}
translatedOptParams
}
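// Behavior sketch (illustrative): when a user writes with
//   df.write.format("hudi").partitionBy("dt")...
// Spark passes the partition columns via SparkDataSourceUtils.PARTITIONING_COLUMNS_KEY, and
// this method (with the default SimpleKeyGenerator) translates that into
//   PARTITIONPATH_FIELD.key -> "dt"
// unless the user has already set PARTITIONPATH_FIELD explicitly.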
val TABLE_NAME: ConfigProperty[String] = ConfigProperty
.key(HoodieTableConfig.HOODIE_WRITE_TABLE_NAME_KEY)
.noDefaultValue()
.markAdvanced()
.withDocumentation("Table name for the datasource write. Also used to register the table into meta stores.")
/**
* Field used in preCombining before actual write. When two records have the same
* key value, we will pick the one with the largest value for the precombine field,
* determined by Object.compareTo(..)
*/
val PRECOMBINE_FIELD = HoodieWriteConfig.PRECOMBINE_FIELD_NAME
/**
* Payload class used. Override this, if you'd like to roll your own merge logic, when upserting/inserting.
* This will render any value set for `PRECOMBINE_FIELD_OPT_VAL` ineffective
*/
val PAYLOAD_CLASS_NAME = HoodieWriteConfig.WRITE_PAYLOAD_CLASS_NAME
val PAYLOAD_TYPE = HoodieWriteConfig.WRITE_PAYLOAD_TYPE
/**
* HoodieMerger implementations replace the payload to process the merging of data,
* providing the same capabilities as the payload
*/
val RECORD_MERGER_IMPLS = HoodieWriteConfig.RECORD_MERGER_IMPLS
/**
* Id of merger strategy
*/
val RECORD_MERGER_STRATEGY = HoodieWriteConfig.RECORD_MERGER_STRATEGY
/**
* Record key field. Value to be used as the `recordKey` component of `HoodieKey`. Actual value
* will be obtained by invoking .toString() on the field value. Nested fields can be specified using
* the dot notation eg: `a.b.c`
*
*/
val RECORDKEY_FIELD = KeyGeneratorOptions.RECORDKEY_FIELD_NAME
/**
* Secondary key field. Columns to be used as the secondary index columns. Actual value
* will be obtained by invoking .toString() on the field value. Nested fields can be specified using
* the dot notation eg: `a.b.c`
*/
val SECONDARYKEY_COLUMN_NAME = KeyGeneratorOptions.SECONDARYKEY_COLUMN_NAME
/**
* Partition path field. Value to be used at the `partitionPath` component of `HoodieKey`. Actual
* value obtained by invoking .toString()
*/
val PARTITIONPATH_FIELD = KeyGeneratorOptions.PARTITIONPATH_FIELD_NAME
/**
* Flag to indicate whether to use Hive style partitioning.
* If set to true, the names of partition folders follow the <partition_column_name>=<partition_value> format.
* By default false (the names of partition folders are only partition values)
*/
val HIVE_STYLE_PARTITIONING = KeyGeneratorOptions.HIVE_STYLE_PARTITIONING_ENABLE
/**
* Key generator class that implements `org.apache.hudi.keygen.KeyGenerator`, used to extract the key out of incoming records.
*/
val keyGeneratorInferFunc = JFunction.toJavaFunction((config: HoodieConfig) => {
Option.of(DataSourceOptionsHelper.inferKeyGenClazz(config.getProps))
})
val KEYGENERATOR_CLASS_NAME: ConfigProperty[String] = ConfigProperty
.key("hoodie.datasource.write.keygenerator.class")
.defaultValue(classOf[SimpleKeyGenerator].getName)
.withInferFunction(keyGeneratorInferFunc)
.markAdvanced()
.withDocumentation("Key generator class, that implements `org.apache.hudi.keygen.KeyGenerator`")
val KEYGENERATOR_CONSISTENT_LOGICAL_TIMESTAMP_ENABLED: ConfigProperty[String] = KeyGeneratorOptions.KEYGENERATOR_CONSISTENT_LOGICAL_TIMESTAMP_ENABLED
val ENABLE_ROW_WRITER: ConfigProperty[String] = ConfigProperty
.key("hoodie.datasource.write.row.writer.enable")
.defaultValue("true")
.withInferFunction(
JFunction.toJavaFunction((config: HoodieConfig) => {
if (config.getString(OPERATION) == WriteOperationType.BULK_INSERT.value
&& !config.getBooleanOrDefault(HoodieTableConfig.POPULATE_META_FIELDS)
&& config.getBooleanOrDefault(HoodieWriteConfig.COMBINE_BEFORE_INSERT)) {
// need to turn off row writing for BULK_INSERT without meta fields when COMBINE_BEFORE_INSERT is turned on,
// to prevent shortcutting and ignoring the COMBINE_BEFORE_INSERT setting
Option.of("false")
} else {
Option.empty()
}
})
)
.markAdvanced()
.withDocumentation("When set to true, will perform write operations directly using the spark native " +
"`Row` representation, avoiding any additional conversion costs.")
/**
* Enable the bulk insert for sql insert statement.
*/
@Deprecated
val SQL_ENABLE_BULK_INSERT: ConfigProperty[String] = ConfigProperty
.key("hoodie.sql.bulk.insert.enable")
.defaultValue("false")
.markAdvanced()
.deprecatedAfter("0.14.0")
.withDocumentation("When set to true, the sql insert statement will use bulk insert. " +
"This config is deprecated as of 0.14.0. Please use hoodie.spark.sql.insert.into.operation instead.")
@Deprecated
val SQL_INSERT_MODE: ConfigProperty[String] = ConfigProperty
.key("hoodie.sql.insert.mode")
.defaultValue("upsert")
.markAdvanced()
.deprecatedAfter("0.14.0")
.withDocumentation("Insert mode when insert data to pk-table. The optional modes are: upsert, strict and non-strict." +
"For upsert mode, insert statement do the upsert operation for the pk-table which will update the duplicate record." +
"For strict mode, insert statement will keep the primary key uniqueness constraint which do not allow duplicate record." +
"While for non-strict mode, hudi just do the insert operation for the pk-table. This config is deprecated as of 0.14.0. Please use " +
"hoodie.spark.sql.insert.into.operation and hoodie.datasource.insert.dup.policy as you see fit.")
val COMMIT_METADATA_KEYPREFIX: ConfigProperty[String] = ConfigProperty
.key("hoodie.datasource.write.commitmeta.key.prefix")
.defaultValue("_")
.markAdvanced()
.withDocumentation("Option keys beginning with this prefix, are automatically added to the commit/deltacommit metadata. " +
"This is useful to store checkpointing information, in a consistent way with the hudi timeline")
@Deprecated
val INSERT_DROP_DUPS: ConfigProperty[String] = ConfigProperty
.key("hoodie.datasource.write.insert.drop.duplicates")
.defaultValue("false")
.markAdvanced()
.withDocumentation("If set to true, records from the incoming dataframe will not overwrite existing records with the same key during the write operation. " +
"
**Note** Just for Insert operation in Spark SQL writing since 0.14.0, users can switch to the config `hoodie.datasource.insert.dup.policy` instead " +
"for a simplified duplicate handling experience. The new config will be incorporated into all other writing flows and this config will be fully deprecated " +
"in future releases.");
val PARTITIONS_TO_DELETE: ConfigProperty[String] = ConfigProperty
.key("hoodie.datasource.write.partitions.to.delete")
.noDefaultValue()
.markAdvanced()
.withDocumentation("Comma separated list of partitions to delete. Allows use of wildcard *")
val STREAMING_RETRY_CNT: ConfigProperty[String] = ConfigProperty
.key("hoodie.datasource.write.streaming.retry.count")
.defaultValue("3")
.markAdvanced()
.withDocumentation("Config to indicate how many times streaming job should retry for a failed micro batch.")
val STREAMING_RETRY_INTERVAL_MS: ConfigProperty[String] = ConfigProperty
.key("hoodie.datasource.write.streaming.retry.interval.ms")
.defaultValue("2000")
.markAdvanced()
.withDocumentation(" Config to indicate how long (by millisecond) before a retry should issued for failed microbatch")
/**
* By default false. If users prefer streaming progress over data integrity, can set this to true.
*/
val STREAMING_IGNORE_FAILED_BATCH: ConfigProperty[String] = ConfigProperty
.key("hoodie.datasource.write.streaming.ignore.failed.batch")
.defaultValue("false")
.markAdvanced()
.withDocumentation("Config to indicate whether to ignore any non exception error (e.g. writestatus error)"
+ " within a streaming microbatch. Turning this on, could hide the write status errors while the spark checkpoint moves ahead." +
"So, would recommend users to use this with caution.")
val STREAMING_CHECKPOINT_IDENTIFIER: ConfigProperty[String] = ConfigProperty
.key("hoodie.datasource.write.streaming.checkpoint.identifier")
.defaultValue("default_single_writer")
.markAdvanced()
.sinceVersion("0.13.0")
.withDocumentation("A stream identifier used for HUDI to fetch the right checkpoint(`batch id` to be more specific) "
+ "corresponding this writer. Please note that keep the identifier an unique value for different writer "
+ "if under multi-writer scenario. If the value is not set, will only keep the checkpoint info in the memory. "
+ "This could introduce the potential issue that the job is restart(`batch id` is lost) while spark checkpoint write fails, "
+ "causing spark will retry and rewrite the data.")
val STREAMING_DISABLE_COMPACTION: ConfigProperty[String] = ConfigProperty
.key("hoodie.datasource.write.streaming.disable.compaction")
.defaultValue("false")
.markAdvanced()
.sinceVersion("0.14.0")
.withDocumentation("By default for MOR table, async compaction is enabled with spark streaming sink. "
+ "By setting this config to true, we can disable it and the expectation is that, users will schedule and execute "
+ "compaction in a different process/job altogether. Some users may wish to run it separately to manage resources "
+ "across table services and regular ingestion pipeline and so this could be preferred on such cases.")
val META_SYNC_CLIENT_TOOL_CLASS_NAME: ConfigProperty[String] = ConfigProperty
.key("hoodie.meta.sync.client.tool.class")
.defaultValue(classOf[HiveSyncTool].getName)
.markAdvanced()
.withDocumentation("Sync tool class name used to sync to metastore. Defaults to Hive.")
@Deprecated
val RECONCILE_SCHEMA: ConfigProperty[java.lang.Boolean] = HoodieCommonConfig.RECONCILE_SCHEMA
val SET_NULL_FOR_MISSING_COLUMNS: ConfigProperty[String] = HoodieCommonConfig.SET_NULL_FOR_MISSING_COLUMNS
val SPARK_SQL_INSERT_INTO_OPERATION: ConfigProperty[String] = ConfigProperty
.key("hoodie.spark.sql.insert.into.operation")
.defaultValue(WriteOperationType.INSERT.value())
.withValidValues(WriteOperationType.BULK_INSERT.value(), WriteOperationType.INSERT.value(), WriteOperationType.UPSERT.value())
.markAdvanced()
.sinceVersion("0.14.0")
.withDocumentation("Sql write operation to use with INSERT_INTO spark sql command. This comes with 3 possible values, bulk_insert, " +
"insert and upsert. bulk_insert is generally meant for initial loads and is known to be performant compared to insert. But bulk_insert may not " +
"do small file management. If you prefer hudi to automatically manage small files, then you can go with \"insert\". There is no precombine " +
"(if there are duplicates within the same batch being ingested, same dups will be ingested) with bulk_insert and insert and there is no index " +
"look up as well. If you may use INSERT_INTO for mutable dataset, then you may have to set this config value to \"upsert\". With upsert, you will " +
"get both precombine and updates to existing records on storage is also honored. If not, you may see duplicates. ")
val ENABLE_MERGE_INTO_PARTIAL_UPDATES: ConfigProperty[Boolean] = ConfigProperty
.key("hoodie.spark.sql.merge.into.partial.updates")
.defaultValue(true)
.markAdvanced()
.sinceVersion("1.0.0")
.withDocumentation("Whether to write partial updates to the data blocks containing updates "
+ "in MOR tables with Spark SQL MERGE INTO statement. The data blocks containing partial "
+ "updates have a schema with a subset of fields compared to the full schema of the table.")
val NONE_INSERT_DUP_POLICY = "none"
val DROP_INSERT_DUP_POLICY = "drop"
val FAIL_INSERT_DUP_POLICY = "fail"
val INSERT_DUP_POLICY: ConfigProperty[String] = ConfigProperty
.key("hoodie.datasource.insert.dup.policy")
.defaultValue(NONE_INSERT_DUP_POLICY)
.withValidValues(NONE_INSERT_DUP_POLICY, DROP_INSERT_DUP_POLICY, FAIL_INSERT_DUP_POLICY)
.markAdvanced()
.sinceVersion("0.14.0")
.withDocumentation("**Note** This is only applicable to Spark SQL writing.
When operation type is set to \"insert\", users can optionally enforce a dedup policy. This policy will be employed "
+ " when records being ingested already exists in storage. Default policy is none and no action will be taken. Another option is to choose " +
" \"drop\", on which matching records from incoming will be dropped and the rest will be ingested. Third option is \"fail\" which will " +
"fail the write operation when same records are re-ingested. In other words, a given record as deduced by the key generation policy " +
"can be ingested only once to the target table of interest.")
// HIVE SYNC SPECIFIC CONFIGS
// NOTE: DO NOT USE uppercase for the keys as they are internally lower-cased. Using upper-cases causes
// unexpected issues with config getting reset
/**
* @deprecated Hive Specific Configs are moved to {@link HiveSyncConfig}
*/
@Deprecated
val HIVE_SYNC_ENABLED: ConfigProperty[String] = HiveSyncConfigHolder.HIVE_SYNC_ENABLED
@Deprecated
val META_SYNC_ENABLED: ConfigProperty[String] = HoodieSyncConfig.META_SYNC_ENABLED
@Deprecated
val HIVE_DATABASE: ConfigProperty[String] = HoodieSyncConfig.META_SYNC_DATABASE_NAME
@Deprecated
val HIVE_TABLE: ConfigProperty[String] = HoodieSyncConfig.META_SYNC_TABLE_NAME
@Deprecated
val HIVE_BASE_FILE_FORMAT: ConfigProperty[String] = HoodieSyncConfig.META_SYNC_BASE_FILE_FORMAT
@Deprecated
val HIVE_USER: ConfigProperty[String] = HiveSyncConfigHolder.HIVE_USER
@Deprecated
val HIVE_PASS: ConfigProperty[String] = HiveSyncConfigHolder.HIVE_PASS
@Deprecated
val HIVE_URL: ConfigProperty[String] = HiveSyncConfigHolder.HIVE_URL
@Deprecated
val METASTORE_URIS: ConfigProperty[String] = HiveSyncConfigHolder.METASTORE_URIS
@Deprecated
val HIVE_PARTITION_FIELDS: ConfigProperty[String] = HoodieSyncConfig.META_SYNC_PARTITION_FIELDS
@Deprecated
val HIVE_PARTITION_EXTRACTOR_CLASS: ConfigProperty[String] = HoodieSyncConfig.META_SYNC_PARTITION_EXTRACTOR_CLASS
@Deprecated
val HIVE_USE_PRE_APACHE_INPUT_FORMAT: ConfigProperty[String] = HiveSyncConfigHolder.HIVE_USE_PRE_APACHE_INPUT_FORMAT
/** @deprecated Use {@link HIVE_SYNC_MODE} instead of this config from 0.9.0 */
@Deprecated
val HIVE_USE_JDBC: ConfigProperty[String] = HiveSyncConfigHolder.HIVE_USE_JDBC
@Deprecated
val HIVE_AUTO_CREATE_DATABASE: ConfigProperty[String] = HiveSyncConfigHolder.HIVE_AUTO_CREATE_DATABASE
@Deprecated
val HIVE_IGNORE_EXCEPTIONS: ConfigProperty[String] = HiveSyncConfigHolder.HIVE_IGNORE_EXCEPTIONS
@Deprecated
val HIVE_SKIP_RO_SUFFIX_FOR_READ_OPTIMIZED_TABLE: ConfigProperty[String] = HiveSyncConfigHolder.HIVE_SKIP_RO_SUFFIX_FOR_READ_OPTIMIZED_TABLE
@Deprecated
val HIVE_SUPPORT_TIMESTAMP_TYPE: ConfigProperty[String] = HiveSyncConfigHolder.HIVE_SUPPORT_TIMESTAMP_TYPE
/**
* Flag to indicate whether to use conditional syncing in HiveSync.
* If set true, the Hive sync procedure will only run if partition or schema changes are detected.
* By default true.
*/
@Deprecated
val HIVE_CONDITIONAL_SYNC: ConfigProperty[String] = HoodieSyncConfig.META_SYNC_CONDITIONAL_SYNC
@Deprecated
val HIVE_TABLE_PROPERTIES: ConfigProperty[String] = HiveSyncConfigHolder.HIVE_TABLE_PROPERTIES
@Deprecated
val HIVE_TABLE_SERDE_PROPERTIES: ConfigProperty[String] = HiveSyncConfigHolder.HIVE_TABLE_SERDE_PROPERTIES
@Deprecated
val HIVE_SYNC_AS_DATA_SOURCE_TABLE: ConfigProperty[String] = HiveSyncConfigHolder.HIVE_SYNC_AS_DATA_SOURCE_TABLE
// Create table as managed table
@Deprecated
val HIVE_CREATE_MANAGED_TABLE: ConfigProperty[java.lang.Boolean] = HiveSyncConfigHolder.HIVE_CREATE_MANAGED_TABLE
@Deprecated
val HIVE_BATCH_SYNC_PARTITION_NUM: ConfigProperty[java.lang.Integer] = HiveSyncConfigHolder.HIVE_BATCH_SYNC_PARTITION_NUM
@Deprecated
val HIVE_SYNC_MODE: ConfigProperty[String] = HiveSyncConfigHolder.HIVE_SYNC_MODE
@Deprecated
val HIVE_SYNC_BUCKET_SYNC: ConfigProperty[java.lang.Boolean] = HiveSyncConfigHolder.HIVE_SYNC_BUCKET_SYNC
@Deprecated
val HIVE_SYNC_COMMENT: ConfigProperty[String] = HiveSyncConfigHolder.HIVE_SYNC_COMMENT;
// Async Compaction - Enabled by default for MOR
val ASYNC_COMPACT_ENABLE: ConfigProperty[String] = ConfigProperty
.key("hoodie.datasource.compaction.async.enable")
.defaultValue("true")
.markAdvanced()
.withDocumentation("Controls whether async compaction should be turned on for MOR table writing.")
val INLINE_CLUSTERING_ENABLE = HoodieClusteringConfig.INLINE_CLUSTERING
val ASYNC_CLUSTERING_ENABLE = HoodieClusteringConfig.ASYNC_CLUSTERING_ENABLE
val KAFKA_AVRO_VALUE_DESERIALIZER_CLASS: ConfigProperty[String] = ConfigProperty
.key(STREAMER_CONFIG_PREFIX + "source.kafka.value.deserializer.class")
.defaultValue("io.confluent.kafka.serializers.KafkaAvroDeserializer")
.withAlternatives(DELTA_STREAMER_CONFIG_PREFIX + "source.kafka.value.deserializer.class")
.markAdvanced()
.sinceVersion("0.9.0")
.withDocumentation("This class is used by kafka client to deserialize the records")
val DROP_PARTITION_COLUMNS: ConfigProperty[java.lang.Boolean] = HoodieTableConfig.DROP_PARTITION_COLUMNS
val SPARK_SQL_OPTIMIZED_WRITES: ConfigProperty[String] = ConfigProperty
.key("hoodie.spark.sql.optimized.writes.enable")
.defaultValue("true")
.markAdvanced()
.sinceVersion("0.14.0")
.withDocumentation("Controls whether spark sql prepped update, delete, and merge are enabled.")
val OVERWRITE_MODE: ConfigProperty[String] = ConfigProperty
.key("hoodie.datasource.overwrite.mode")
.noDefaultValue()
.withValidValues("STATIC", "DYNAMIC")
.markAdvanced()
.sinceVersion("0.14.0")
.withDocumentation("Controls whether overwrite use dynamic or static mode, if not configured, " +
"respect spark.sql.sources.partitionOverwriteMode")
/** @deprecated Use {@link HIVE_USE_PRE_APACHE_INPUT_FORMAT} and its methods instead */
@Deprecated
val HIVE_USE_PRE_APACHE_INPUT_FORMAT_OPT_KEY = HiveSyncConfigHolder.HIVE_USE_PRE_APACHE_INPUT_FORMAT.key()
/** @deprecated Use {@link HIVE_USE_JDBC} and its methods instead */
@Deprecated
val HIVE_USE_JDBC_OPT_KEY = HiveSyncConfigHolder.HIVE_USE_JDBC.key()
/** @deprecated Use {@link HIVE_AUTO_CREATE_DATABASE} and its methods instead */
@Deprecated
val HIVE_AUTO_CREATE_DATABASE_OPT_KEY = HiveSyncConfigHolder.HIVE_AUTO_CREATE_DATABASE.key()
/** @deprecated Use {@link HIVE_IGNORE_EXCEPTIONS} and its methods instead */
@Deprecated
val HIVE_IGNORE_EXCEPTIONS_OPT_KEY = HiveSyncConfigHolder.HIVE_IGNORE_EXCEPTIONS.key()
/** @deprecated Use {@link STREAMING_IGNORE_FAILED_BATCH} and its methods instead */
@Deprecated
val STREAMING_IGNORE_FAILED_BATCH_OPT_KEY = STREAMING_IGNORE_FAILED_BATCH.key()
/** @deprecated Use {@link STREAMING_IGNORE_FAILED_BATCH} and its methods instead */
@Deprecated
val DEFAULT_STREAMING_IGNORE_FAILED_BATCH_OPT_VAL = STREAMING_IGNORE_FAILED_BATCH.defaultValue()
/** @deprecated Use {@link META_SYNC_CLIENT_TOOL_CLASS_NAME} and its methods instead */
@Deprecated
val META_SYNC_CLIENT_TOOL_CLASS = META_SYNC_CLIENT_TOOL_CLASS_NAME.key()
/** @deprecated Use {@link META_SYNC_CLIENT_TOOL_CLASS_NAME} and its methods instead */
@Deprecated
val DEFAULT_META_SYNC_CLIENT_TOOL_CLASS = META_SYNC_CLIENT_TOOL_CLASS_NAME.defaultValue()
/** @deprecated Use {@link HIVE_SYNC_ENABLED} and its methods instead */
@Deprecated
val HIVE_SYNC_ENABLED_OPT_KEY = HiveSyncConfigHolder.HIVE_SYNC_ENABLED.key()
/** @deprecated Use {@link META_SYNC_ENABLED} and its methods instead */
@Deprecated
val META_SYNC_ENABLED_OPT_KEY = HoodieSyncConfig.META_SYNC_ENABLED.key()
/** @deprecated Use {@link HIVE_DATABASE} and its methods instead */
@Deprecated
val HIVE_DATABASE_OPT_KEY = HoodieSyncConfig.META_SYNC_DATABASE_NAME.key()
/** @deprecated Use {@link HIVE_TABLE} and its methods instead */
@Deprecated
val HIVE_TABLE_OPT_KEY = HoodieSyncConfig.META_SYNC_TABLE_NAME.key()
/** @deprecated Use {@link HIVE_BASE_FILE_FORMAT} and its methods instead */
@Deprecated
val HIVE_BASE_FILE_FORMAT_OPT_KEY = HoodieSyncConfig.META_SYNC_BASE_FILE_FORMAT.key()
/** @deprecated Use {@link HIVE_USER} and its methods instead */
@Deprecated
val HIVE_USER_OPT_KEY = HiveSyncConfigHolder.HIVE_USER.key()
/** @deprecated Use {@link HIVE_PASS} and its methods instead */
@Deprecated
val HIVE_PASS_OPT_KEY = HiveSyncConfigHolder.HIVE_PASS.key()
/** @deprecated Use {@link HIVE_URL} and its methods instead */
@Deprecated
val HIVE_URL_OPT_KEY = HiveSyncConfigHolder.HIVE_URL.key()
/** @deprecated Use {@link HIVE_PARTITION_FIELDS} and its methods instead */
@Deprecated
val HIVE_PARTITION_FIELDS_OPT_KEY = HoodieSyncConfig.META_SYNC_PARTITION_FIELDS.key()
/** @deprecated Use {@link HIVE_PARTITION_EXTRACTOR_CLASS} and its methods instead */
@Deprecated
val HIVE_PARTITION_EXTRACTOR_CLASS_OPT_KEY = HoodieSyncConfig.META_SYNC_PARTITION_EXTRACTOR_CLASS.key()
/** @deprecated Use {@link KEYGENERATOR_CLASS_NAME} and its methods instead */
@Deprecated
val DEFAULT_KEYGENERATOR_CLASS_OPT_VAL = KEYGENERATOR_CLASS_NAME.defaultValue()
/** @deprecated Use {@link KEYGENERATOR_CLASS_NAME} and its methods instead */
@Deprecated
val KEYGENERATOR_CLASS_OPT_KEY = HoodieWriteConfig.KEYGENERATOR_CLASS_NAME.key()
/** @deprecated Use {@link ENABLE_ROW_WRITER} and its methods instead */
@Deprecated
val ENABLE_ROW_WRITER_OPT_KEY = ENABLE_ROW_WRITER.key()
/** @deprecated Use {@link ENABLE_ROW_WRITER} and its methods instead */
@Deprecated
val DEFAULT_ENABLE_ROW_WRITER_OPT_VAL = ENABLE_ROW_WRITER.defaultValue()
/** @deprecated Use {@link HIVE_STYLE_PARTITIONING} and its methods instead */
@Deprecated
val HIVE_STYLE_PARTITIONING_OPT_KEY = KeyGeneratorOptions.HIVE_STYLE_PARTITIONING_ENABLE.key()
/** @deprecated Use {@link HIVE_STYLE_PARTITIONING} and its methods instead */
@Deprecated
val DEFAULT_HIVE_STYLE_PARTITIONING_OPT_VAL = HIVE_STYLE_PARTITIONING.defaultValue()
val URL_ENCODE_PARTITIONING = KeyGeneratorOptions.URL_ENCODE_PARTITIONING
/** @deprecated Use {@link URL_ENCODE_PARTITIONING} and its methods instead */
@Deprecated
val URL_ENCODE_PARTITIONING_OPT_KEY = KeyGeneratorOptions.URL_ENCODE_PARTITIONING.key()
/** @deprecated Use {@link URL_ENCODE_PARTITIONING} and its methods instead */
@Deprecated
val DEFAULT_URL_ENCODE_PARTITIONING_OPT_VAL = URL_ENCODE_PARTITIONING.defaultValue()
/** @deprecated Use {@link COMMIT_METADATA_KEYPREFIX} and its methods instead */
@Deprecated
val COMMIT_METADATA_KEYPREFIX_OPT_KEY = COMMIT_METADATA_KEYPREFIX.key()
/** @deprecated Use {@link COMMIT_METADATA_KEYPREFIX} and its methods instead */
@Deprecated
val DEFAULT_COMMIT_METADATA_KEYPREFIX_OPT_VAL = COMMIT_METADATA_KEYPREFIX.defaultValue()
/** @deprecated Use {@link INSERT_DROP_DUPS} and its methods instead */
@Deprecated
val INSERT_DROP_DUPS_OPT_KEY = INSERT_DROP_DUPS.key()
/** @deprecated Use {@link INSERT_DROP_DUPS} and its methods instead */
@Deprecated
val DEFAULT_INSERT_DROP_DUPS_OPT_VAL = INSERT_DROP_DUPS.defaultValue()
/** @deprecated Use {@link STREAMING_RETRY_CNT} and its methods instead */
@Deprecated
val STREAMING_RETRY_CNT_OPT_KEY = STREAMING_RETRY_CNT.key()
/** @deprecated Use {@link STREAMING_RETRY_CNT} and its methods instead */
@Deprecated
val DEFAULT_STREAMING_RETRY_CNT_OPT_VAL = STREAMING_RETRY_CNT.defaultValue()
/** @deprecated Use {@link STREAMING_RETRY_INTERVAL_MS} and its methods instead */
@Deprecated
val STREAMING_RETRY_INTERVAL_MS_OPT_KEY = STREAMING_RETRY_INTERVAL_MS.key()
/** @deprecated Use {@link STREAMING_RETRY_INTERVAL_MS} and its methods instead */
@Deprecated
val DEFAULT_STREAMING_RETRY_INTERVAL_MS_OPT_VAL = STREAMING_RETRY_INTERVAL_MS.defaultValue()
/** @deprecated Use {@link RECORDKEY_FIELD} and its methods instead */
@Deprecated
val RECORDKEY_FIELD_OPT_KEY = KeyGeneratorOptions.RECORDKEY_FIELD_NAME.key()
/** @deprecated Use {@link PARTITIONPATH_FIELD} and its methods instead */
@Deprecated
val PARTITIONPATH_FIELD_OPT_KEY = KeyGeneratorOptions.PARTITIONPATH_FIELD_NAME.key()
/** @deprecated Use {@link PARTITIONPATH_FIELD} and its methods instead */
@Deprecated
val DEFAULT_PARTITIONPATH_FIELD_OPT_VAL = null
/** @deprecated Use {@link TABLE_NAME} and its methods instead */
@Deprecated
val TABLE_NAME_OPT_KEY = TABLE_NAME.key()
/** @deprecated Use {@link PRECOMBINE_FIELD} and its methods instead */
@Deprecated
val PRECOMBINE_FIELD_OPT_KEY = HoodieWriteConfig.PRECOMBINE_FIELD_NAME.key()
/** @deprecated Use {@link PRECOMBINE_FIELD} and its methods instead */
@Deprecated
val DEFAULT_PRECOMBINE_FIELD_OPT_VAL = PRECOMBINE_FIELD.defaultValue()
/** @deprecated Use {@link HoodieWriteConfig.WRITE_PAYLOAD_CLASS_NAME} and its methods instead */
@Deprecated
val PAYLOAD_CLASS_OPT_KEY = HoodieWriteConfig.WRITE_PAYLOAD_CLASS_NAME.key()
/** @deprecated Use {@link HoodieWriteConfig.WRITE_PAYLOAD_CLASS_NAME} and its methods instead */
@Deprecated
val DEFAULT_PAYLOAD_OPT_VAL = PAYLOAD_CLASS_NAME.defaultValue()
/** @deprecated Use {@link TABLE_TYPE} and its methods instead */
@Deprecated
val TABLE_TYPE_OPT_KEY = TABLE_TYPE.key()
/** @deprecated Use {@link TABLE_TYPE} and its methods instead */
@Deprecated
val DEFAULT_TABLE_TYPE_OPT_VAL = TABLE_TYPE.defaultValue()
/** @deprecated Use {@link TABLE_TYPE} and its methods instead */
@Deprecated
val STORAGE_TYPE_OPT_KEY = "hoodie.datasource.write.storage.type"
@Deprecated
val COW_STORAGE_TYPE_OPT_VAL = HoodieTableType.COPY_ON_WRITE.name
@Deprecated
val MOR_STORAGE_TYPE_OPT_VAL = HoodieTableType.MERGE_ON_READ.name
/** @deprecated Use {@link TABLE_TYPE} and its methods instead */
@Deprecated
val DEFAULT_STORAGE_TYPE_OPT_VAL = COW_STORAGE_TYPE_OPT_VAL
/** @deprecated Use {@link OPERATION} and its methods instead */
@Deprecated
val OPERATION_OPT_KEY = OPERATION.key()
/** @deprecated Use {@link OPERATION} and its methods instead */
@Deprecated
val DEFAULT_OPERATION_OPT_VAL = OPERATION.defaultValue()
/** @deprecated Use {@link HIVE_SYNC_ENABLED} and its methods instead */
@Deprecated
val DEFAULT_HIVE_SYNC_ENABLED_OPT_VAL = HiveSyncConfigHolder.HIVE_SYNC_ENABLED.defaultValue()
/** @deprecated Use {@link META_SYNC_ENABLED} and its methods instead */
@Deprecated
val DEFAULT_META_SYNC_ENABLED_OPT_VAL = HoodieSyncConfig.META_SYNC_ENABLED.defaultValue()
/** @deprecated Use {@link HIVE_DATABASE} and its methods instead */
@Deprecated
val DEFAULT_HIVE_DATABASE_OPT_VAL = HoodieSyncConfig.META_SYNC_DATABASE_NAME.defaultValue()
/** @deprecated Use {@link HIVE_TABLE} and its methods instead */
@Deprecated
val DEFAULT_HIVE_TABLE_OPT_VAL = HoodieSyncConfig.META_SYNC_TABLE_NAME.defaultValue()
/** @deprecated Use {@link HIVE_BASE_FILE_FORMAT} and its methods instead */
@Deprecated
val DEFAULT_HIVE_BASE_FILE_FORMAT_OPT_VAL = HoodieSyncConfig.META_SYNC_BASE_FILE_FORMAT.defaultValue()
/** @deprecated Use {@link HIVE_USER} and its methods instead */
@Deprecated
val DEFAULT_HIVE_USER_OPT_VAL = HiveSyncConfigHolder.HIVE_USER.defaultValue()
/** @deprecated Use {@link HIVE_PASS} and its methods instead */
@Deprecated
val DEFAULT_HIVE_PASS_OPT_VAL = HiveSyncConfigHolder.HIVE_PASS.defaultValue()
/** @deprecated Use {@link HIVE_URL} and its methods instead */
@Deprecated
val DEFAULT_HIVE_URL_OPT_VAL = HiveSyncConfigHolder.HIVE_URL.defaultValue()
/** @deprecated Use {@link HIVE_PARTITION_FIELDS} and its methods instead */
@Deprecated
val DEFAULT_HIVE_PARTITION_FIELDS_OPT_VAL = HoodieSyncConfig.META_SYNC_PARTITION_FIELDS.defaultValue()
/** @deprecated Use {@link HIVE_PARTITION_EXTRACTOR_CLASS} and its methods instead */
@Deprecated
val DEFAULT_HIVE_PARTITION_EXTRACTOR_CLASS_OPT_VAL = HoodieSyncConfig.META_SYNC_PARTITION_EXTRACTOR_CLASS.defaultValue()
@Deprecated
val DEFAULT_USE_PRE_APACHE_INPUT_FORMAT_OPT_VAL = "false"
/** @deprecated Use {@link HIVE_USE_JDBC} and its methods instead */
@Deprecated
val DEFAULT_HIVE_USE_JDBC_OPT_VAL = HiveSyncConfigHolder.HIVE_USE_JDBC.defaultValue()
/** @deprecated Use {@link HIVE_AUTO_CREATE_DATABASE} and its methods instead */
@Deprecated
val DEFAULT_HIVE_AUTO_CREATE_DATABASE_OPT_KEY = HiveSyncConfigHolder.HIVE_AUTO_CREATE_DATABASE.defaultValue()
/** @deprecated Use {@link HIVE_IGNORE_EXCEPTIONS} and its methods instead */
@Deprecated
val DEFAULT_HIVE_IGNORE_EXCEPTIONS_OPT_KEY = HiveSyncConfigHolder.HIVE_IGNORE_EXCEPTIONS.defaultValue()
/** @deprecated Use {@link HIVE_SKIP_RO_SUFFIX_FOR_READ_OPTIMIZED_TABLE} and its methods instead */
@Deprecated
val HIVE_SKIP_RO_SUFFIX = HiveSyncConfigHolder.HIVE_SKIP_RO_SUFFIX_FOR_READ_OPTIMIZED_TABLE.key()
/** @deprecated Use {@link HIVE_SKIP_RO_SUFFIX_FOR_READ_OPTIMIZED_TABLE} and its methods instead */
@Deprecated
val DEFAULT_HIVE_SKIP_RO_SUFFIX_VAL = HiveSyncConfigHolder.HIVE_SKIP_RO_SUFFIX_FOR_READ_OPTIMIZED_TABLE.defaultValue()
/** @deprecated Use {@link HIVE_SUPPORT_TIMESTAMP_TYPE} and its methods instead */
@Deprecated
val HIVE_SUPPORT_TIMESTAMP = HiveSyncConfigHolder.HIVE_SUPPORT_TIMESTAMP_TYPE.key()
/** @deprecated Use {@link HIVE_SUPPORT_TIMESTAMP_TYPE} and its methods instead */
@Deprecated
val DEFAULT_HIVE_SUPPORT_TIMESTAMP = HiveSyncConfigHolder.HIVE_SUPPORT_TIMESTAMP_TYPE.defaultValue()
/** @deprecated Use {@link ASYNC_COMPACT_ENABLE} and its methods instead */
@Deprecated
val ASYNC_COMPACT_ENABLE_OPT_KEY = ASYNC_COMPACT_ENABLE.key()
/** @deprecated Use {@link ASYNC_COMPACT_ENABLE} and its methods instead */
@Deprecated
val DEFAULT_ASYNC_COMPACT_ENABLE_OPT_VAL = ASYNC_COMPACT_ENABLE.defaultValue()
/** @deprecated Use {@link KAFKA_AVRO_VALUE_DESERIALIZER_CLASS} and its methods instead */
@Deprecated
val KAFKA_AVRO_VALUE_DESERIALIZER = KAFKA_AVRO_VALUE_DESERIALIZER_CLASS.key()
@Deprecated
val SCHEMA_PROVIDER_CLASS_PROP = STREAMER_CONFIG_PREFIX + "schemaprovider.class"
}
object DataSourceOptionsHelper {
private val log = LoggerFactory.getLogger(DataSourceOptionsHelper.getClass)
// put all the configs with alternatives here
private val allConfigsWithAlternatives = List(
DataSourceReadOptions.QUERY_TYPE,
DataSourceWriteOptions.TABLE_TYPE,
HoodieTableConfig.BASE_FILE_FORMAT,
HoodieTableConfig.LOG_FILE_FORMAT
)
// put all the deprecated configs here
val allDeprecatedConfigs: Set[String] = Set(
ConsistencyGuardConfig.ENABLE.key
)
// maps the deprecated config name to its latest name
val allAlternatives: Map[String, String] = {
val alterMap = scala.collection.mutable.Map[String, String]()
allConfigsWithAlternatives.foreach(cfg => cfg.getAlternatives.asScala.foreach(alternative => alterMap(alternative) = cfg.key))
alterMap.toMap
}
val viewTypeValueMap: Map[String, String] = Map(
DataSourceReadOptions.VIEW_TYPE_READ_OPTIMIZED_OPT_VAL -> DataSourceReadOptions.QUERY_TYPE_SNAPSHOT_OPT_VAL,
DataSourceReadOptions.VIEW_TYPE_INCREMENTAL_OPT_VAL -> DataSourceReadOptions.QUERY_TYPE_INCREMENTAL_OPT_VAL,
DataSourceReadOptions.VIEW_TYPE_REALTIME_OPT_VAL -> DataSourceReadOptions.QUERY_TYPE_SNAPSHOT_OPT_VAL)
def translateConfigurations(optParams: Map[String, String]): Map[String, String] = {
val translatedOpt = scala.collection.mutable.Map[String, String]() ++= optParams
optParams.keySet.foreach(opt => {
if (allAlternatives.contains(opt) && !optParams.contains(allAlternatives(opt))) {
log.warn(opt + " is deprecated and will be removed in a later release; Please use " + allAlternatives(opt))
if (opt == DataSourceReadOptions.VIEW_TYPE_OPT_KEY) {
// special handle for VIEW_TYPE, also need to translate its values
translatedOpt ++= Map(allAlternatives(opt) -> viewTypeValueMap(optParams(opt)))
} else {
translatedOpt ++= Map(allAlternatives(opt) -> optParams(opt))
}
}
if (allDeprecatedConfigs.contains(opt)) {
log.warn(opt + " is deprecated and should never be used anymore")
}
})
translatedOpt.toMap
}
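// Behavior sketch (illustrative): the deprecated view-type option is translated into the
// current query-type option, including a translation of its value.
//
//   translateConfigurations(Map("hoodie.datasource.view.type" -> "realtime"))
//   // ==> Map("hoodie.datasource.view.type" -> "realtime",
//   //         "hoodie.datasource.query.type" -> "snapshot")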
/**
* Some config keys differ between what the user sets and what is part of the table config. This method assists in fetching the
* right table config and populating the write configs.
* @param tableConfig table config of interest.
* @param params incoming write params.
* @return missing params that need to be added to the incoming write params
*/
def fetchMissingWriteConfigsFromTableConfig(tableConfig: HoodieTableConfig, params: Map[String, String]) : Map[String, String] = {
val missingWriteConfigs = scala.collection.mutable.Map[String, String]()
if (!params.contains(KeyGeneratorOptions.RECORDKEY_FIELD_NAME.key()) && tableConfig.getRawRecordKeyFieldProp != null) {
missingWriteConfigs ++= Map(KeyGeneratorOptions.RECORDKEY_FIELD_NAME.key() -> tableConfig.getRawRecordKeyFieldProp)
}
if (!params.contains(KeyGeneratorOptions.PARTITIONPATH_FIELD_NAME.key()) && tableConfig.getPartitionFieldProp != null) {
missingWriteConfigs ++= Map(KeyGeneratorOptions.PARTITIONPATH_FIELD_NAME.key() -> tableConfig.getPartitionFieldProp)
}
if (!params.contains(DataSourceWriteOptions.KEYGENERATOR_CLASS_NAME.key()) && tableConfig.getKeyGeneratorClassName != null) {
missingWriteConfigs ++= Map(DataSourceWriteOptions.KEYGENERATOR_CLASS_NAME.key() -> tableConfig.getKeyGeneratorClassName)
}
if (!params.contains(HoodieWriteConfig.PRECOMBINE_FIELD_NAME.key()) && tableConfig.getPreCombineField != null) {
missingWriteConfigs ++= Map(HoodieWriteConfig.PRECOMBINE_FIELD_NAME.key -> tableConfig.getPreCombineField)
}
if (!params.contains(HoodieWriteConfig.WRITE_PAYLOAD_CLASS_NAME.key()) && tableConfig.getPayloadClass != null) {
missingWriteConfigs ++= Map(HoodieWriteConfig.WRITE_PAYLOAD_CLASS_NAME.key() -> tableConfig.getPayloadClass)
}
if (!params.contains(DataSourceWriteOptions.TABLE_TYPE.key())) {
missingWriteConfigs ++= Map(DataSourceWriteOptions.TABLE_TYPE.key() -> tableConfig.getTableType.name())
}
missingWriteConfigs.toMap
}
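// Behavior sketch (illustrative): if a table was created with record key "uuid" and the
// incoming write params omit it, the returned map fills it back in from the table config,
// e.g. "hoodie.datasource.write.recordkey.field" -> "uuid"; likewise for the partition
// path field, key generator class, precombine field, payload class and table type.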
def parametersWithReadDefaults(parameters: Map[String, String]): Map[String, String] = {
// First check if ConfigUtils.IS_QUERY_AS_RO_TABLE has been set by HiveSyncTool,
// or else use the query type from QUERY_TYPE.
val paramsWithGlobalProps = DFSPropertiesConfiguration.getGlobalProps.asScala.toMap ++ parameters
val queryType = paramsWithGlobalProps.get(IS_QUERY_AS_RO_TABLE)
.map(is => if (is.toBoolean) QUERY_TYPE_READ_OPTIMIZED_OPT_VAL else QUERY_TYPE_SNAPSHOT_OPT_VAL)
.getOrElse(paramsWithGlobalProps.getOrElse(QUERY_TYPE.key, QUERY_TYPE.defaultValue()))
Map(
QUERY_TYPE.key -> queryType
) ++ translateConfigurations(paramsWithGlobalProps)
}
def inferKeyGenClazz(props: TypedProperties): String = {
getKeyGeneratorClassNameFromType(inferKeyGeneratorTypeFromWriteConfig(props))
}
def inferKeyGenClazz(recordsKeyFields: String, partitionFields: String): String = {
getKeyGeneratorClassNameFromType(inferKeyGeneratorType(Option.ofNullable(recordsKeyFields), partitionFields))
}
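// Behavior sketch (illustrative, assuming the usual inference rules): a single record key
// field plus a partition field infers SimpleKeyGenerator; an empty partition field infers
// NonpartitionedKeyGenerator.
//
//   inferKeyGenClazz("uuid", "dt") // => classOf[SimpleKeyGenerator].getName
//   inferKeyGenClazz("uuid", "")   // => classOf[NonpartitionedKeyGenerator].getName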
implicit def convert[T, U](prop: ConfigProperty[T])(implicit converter: T => U): ConfigProperty[U] = {
checkState(prop.hasDefaultValue)
var newProp: ConfigProperty[U] = ConfigProperty.key(prop.key())
.defaultValue(converter(prop.defaultValue()))
.withDocumentation(prop.doc())
.withAlternatives(prop.getAlternatives.asScala.toSeq: _*)
newProp = toScalaOption(prop.getSinceVersion) match {
case Some(version) => newProp.sinceVersion(version)
case None => newProp
}
newProp = toScalaOption(prop.getDeprecatedVersion) match {
case Some(version) => newProp.deprecatedAfter(version)
case None => newProp
}
newProp
}
}