/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.spark.sql.hudi

import org.apache.hudi.{DataSourceWriteOptions, HoodieFileIndex}
import org.apache.hudi.AutoRecordKeyGenerationUtils.shouldAutoGenerateRecordKeys
import org.apache.hudi.DataSourceWriteOptions._
import org.apache.hudi.HoodieConversionUtils.toProperties
import org.apache.hudi.common.config.{DFSPropertiesConfiguration, HoodieCommonConfig, RecordMergeMode, TypedProperties}
import org.apache.hudi.common.model.{DefaultHoodieRecordPayload, HoodieRecordMerger, WriteOperationType}
import org.apache.hudi.common.table.HoodieTableConfig
import org.apache.hudi.common.util.{ReflectionUtils, StringUtils}
import org.apache.hudi.config.{HoodieIndexConfig, HoodieInternalConfig, HoodieWriteConfig}
import org.apache.hudi.common.table.HoodieTableConfig.DATABASE_NAME
import org.apache.hudi.config.HoodieWriteConfig.TBL_NAME
import org.apache.hudi.hive.{HiveSyncConfig, HiveSyncConfigHolder, MultiPartKeysValueExtractor}
import org.apache.hudi.hive.ddl.HiveSyncMode
import org.apache.hudi.keygen.{ComplexKeyGenerator, CustomAvroKeyGenerator, CustomKeyGenerator}
import org.apache.hudi.sql.InsertMode
import org.apache.hudi.sync.common.HoodieSyncConfig

import org.apache.spark.internal.Logging
import org.apache.spark.sql.{SaveMode, SparkSession}
import org.apache.spark.sql.catalyst.catalog.HoodieCatalogTable
import org.apache.spark.sql.catalyst.expressions.{AttributeReference, EqualTo, Literal}
import org.apache.spark.sql.execution.datasources.FileStatusCache
import org.apache.spark.sql.hive.HiveExternalCatalog
import org.apache.spark.sql.hudi.HoodieOptionConfig.mapSqlOptionsToDataSourceWriteConfigs
import org.apache.spark.sql.hudi.HoodieSqlCommonUtils.{filterHoodieConfigs, isUsingHiveCatalog}
import org.apache.spark.sql.hudi.ProvidesHoodieConfig.{combineOptions, getPartitionPathFieldWriteConfig}
import org.apache.spark.sql.hudi.command.{SqlKeyGenerator, ValidateDuplicateKeyPayload}
import org.apache.spark.sql.internal.SQLConf
import org.apache.spark.sql.internal.SQLConf.PARTITION_OVERWRITE_MODE
import org.apache.spark.sql.types.StructType
import org.slf4j.LoggerFactory

import java.util.Locale

import scala.collection.JavaConverters._

trait ProvidesHoodieConfig extends Logging {

  def buildHoodieConfig(hoodieCatalogTable: HoodieCatalogTable): Map[String, String] = {
    val sparkSession: SparkSession = hoodieCatalogTable.spark
    val tableConfig = hoodieCatalogTable.tableConfig

    // NOTE: Here we fall back to "" to make sure that a null value is not overridden with the
    // default value ("ts")
    // TODO(HUDI-3456) clean up
    val preCombineField = Option(tableConfig.getPreCombineField).getOrElse("")
    val hiveSyncConfig = buildHiveSyncConfig(sparkSession, hoodieCatalogTable, tableConfig)

    val defaultOpts = Map[String, String](
      OPERATION.key -> UPSERT_OPERATION_OPT_VAL,
      KEYGENERATOR_CLASS_NAME.key -> classOf[SqlKeyGenerator].getCanonicalName,
      SqlKeyGenerator.ORIGINAL_KEYGEN_CLASS_NAME -> tableConfig.getKeyGeneratorClassName,
      SqlKeyGenerator.PARTITION_SCHEMA -> hoodieCatalogTable.partitionSchema.toDDL,
      HoodieSyncConfig.META_SYNC_ENABLED.key -> hiveSyncConfig.getString(HoodieSyncConfig.META_SYNC_ENABLED.key),
      HiveSyncConfigHolder.HIVE_SYNC_ENABLED.key -> hiveSyncConfig.getString(HiveSyncConfigHolder.HIVE_SYNC_ENABLED.key),
      HiveSyncConfigHolder.HIVE_SYNC_MODE.key -> hiveSyncConfig.getStringOrDefault(HiveSyncConfigHolder.HIVE_SYNC_MODE, HiveSyncMode.HMS.name()),
      HoodieSyncConfig.META_SYNC_DATABASE_NAME.key -> hiveSyncConfig.getStringOrDefault(HoodieSyncConfig.META_SYNC_DATABASE_NAME),
      HoodieSyncConfig.META_SYNC_TABLE_NAME.key -> hiveSyncConfig.getStringOrDefault(HoodieSyncConfig.META_SYNC_TABLE_NAME),
      HoodieSyncConfig.META_SYNC_PARTITION_FIELDS.key -> tableConfig.getPartitionFieldProp,
      HoodieSyncConfig.META_SYNC_PARTITION_EXTRACTOR_CLASS.key -> hiveSyncConfig.getStringOrDefault(HoodieSyncConfig.META_SYNC_PARTITION_EXTRACTOR_CLASS),
      HiveSyncConfigHolder.HIVE_SUPPORT_TIMESTAMP_TYPE.key -> hiveSyncConfig.getBoolean(HiveSyncConfigHolder.HIVE_SUPPORT_TIMESTAMP_TYPE).toString
    )

    val overridingOpts = Map[String, String](
      "path" -> hoodieCatalogTable.tableLocation,
      RECORDKEY_FIELD.key -> hoodieCatalogTable.primaryKeys.mkString(","),
      DATABASE_NAME.key -> hoodieCatalogTable.table.database,
      TBL_NAME.key -> hoodieCatalogTable.tableName,
      PRECOMBINE_FIELD.key -> preCombineField,
      HIVE_STYLE_PARTITIONING.key -> tableConfig.getHiveStylePartitioningEnable,
      URL_ENCODE_PARTITIONING.key -> tableConfig.getUrlEncodePartitioning,
      PARTITIONPATH_FIELD.key -> getPartitionPathFieldWriteConfig(
        tableConfig.getKeyGeneratorClassName, tableConfig.getPartitionFieldProp, hoodieCatalogTable)
    )

    combineOptions(hoodieCatalogTable, tableConfig, sparkSession.sqlContext.conf,
      defaultOpts = defaultOpts, overridingOpts = overridingOpts)
  }
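
  // Illustrative usage sketch (hypothetical caller; `df` and the surrounding command are assumptions,
  // not part of this file): the returned map is intended to be handed to the Hudi datasource writer.
  //
  //   val writeOpts: Map[String, String] = buildHoodieConfig(hoodieCatalogTable)
  //   df.write.format("hudi").options(writeOpts).mode(SaveMode.Append).save()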


  private def deducePayloadClassNameLegacy(operation: String, tableType: String, insertMode: InsertMode): String = {
    if (operation == UPSERT_OPERATION_OPT_VAL &&
      tableType == COW_TABLE_TYPE_OPT_VAL && insertMode == InsertMode.STRICT) {
      // Validate duplicate keys for COW; for MOR the merge is done with DefaultHoodieRecordPayload
      // on read.
      // TODO use HoodieSparkValidateDuplicateKeyRecordMerger when SparkRecordMerger is default
      classOf[ValidateDuplicateKeyPayload].getCanonicalName
    } else if (operation == INSERT_OPERATION_OPT_VAL && tableType == COW_TABLE_TYPE_OPT_VAL &&
      insertMode == InsertMode.STRICT) {
      // Validate duplicate key for inserts to COW table when using strict insert mode.
      classOf[ValidateDuplicateKeyPayload].getCanonicalName
    } else {
      classOf[DefaultHoodieRecordPayload].getCanonicalName
    }
  }

  /**
   * Deduce the sql write operation for INSERT_INTO
   */
  private def deduceSparkSqlInsertIntoWriteOperation(isOverwritePartition: Boolean, isOverwriteTable: Boolean,
                                                     shouldAutoKeyGen: Boolean, preCombineField: String,
                                                     sparkSqlInsertIntoOperationSet: Boolean, sparkSqlInsertIntoOperation: String): String = {
    if (isOverwriteTable) {
      INSERT_OVERWRITE_TABLE_OPERATION_OPT_VAL
    } else if (isOverwritePartition) {
      INSERT_OVERWRITE_OPERATION_OPT_VAL
    } else if (!sparkSqlInsertIntoOperationSet && !shouldAutoKeyGen && preCombineField.nonEmpty) {
      UPSERT_OPERATION_OPT_VAL
    } else {
      sparkSqlInsertIntoOperation
    }
  }
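
  // For reference, the deduction above resolves in this order (operation option values shown):
  //   insert overwrite table                                                   -> "insert_overwrite_table"
  //   insert overwrite partition                                               -> "insert_overwrite"
  //   insert-into operation not explicitly set, no auto key-gen, preCombine set -> "upsert"
  //   otherwise                                                                -> the configured (or default) spark-sql insert-into operation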

  /**
   * Deduce the insert operation
   */
  private def deduceOperation(enableBulkInsert: Boolean, isOverwritePartition: Boolean, isOverwriteTable: Boolean,
                              dropDuplicate: Boolean, isNonStrictMode: Boolean, isPartitionedTable: Boolean,
                              combineBeforeInsert: Boolean, insertMode: InsertMode, autoGenerateRecordKeys: Boolean): String = {
    (enableBulkInsert, isOverwritePartition, isOverwriteTable, dropDuplicate, isNonStrictMode, isPartitionedTable, autoGenerateRecordKeys) match {
      case (true, _, _, _, false, _, _) =>
        throw new IllegalArgumentException(s"A table with a primary key cannot use bulk insert in ${insertMode.value()} mode.")
      case (true, _, _, true, _, _, _) =>
        throw new IllegalArgumentException(s"Bulk insert does not support dropping duplicates." +
          s" Please disable $INSERT_DROP_DUPS and try again.")
      // Bulk insert with overwrite table
      case (true, false, true, _, _, _, _) =>
        BULK_INSERT_OPERATION_OPT_VAL
      // Bulk insert with overwrite table partition
      case (true, true, false, _, _, true, _) =>
        BULK_INSERT_OPERATION_OPT_VAL
      // insert overwrite table
      case (false, false, true, _, _, _, _) => INSERT_OVERWRITE_TABLE_OPERATION_OPT_VAL
      // insert overwrite partition
      case (false, true, false, _, _, true, _) => INSERT_OVERWRITE_OPERATION_OPT_VAL
      // dropDuplicate disabled and a preCombine key provided: use the upsert operation in strict and upsert modes.
      case (false, false, false, false, false, _, _) if combineBeforeInsert => UPSERT_OPERATION_OPT_VAL
      // if the table has a primary key and bulk insert is enabled, use bulk insert in non-strict mode.
      case (true, false, false, _, true, _, _) => BULK_INSERT_OPERATION_OPT_VAL
      // if auto record key generation is enabled, use bulk_insert
      case (_, _, _, _, _, _, true) => BULK_INSERT_OPERATION_OPT_VAL
      // for all remaining cases, use the insert operation
      case _ => INSERT_OPERATION_OPT_VAL
    }
  }
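
  // Example (illustrative): a partitioned table with a primary key and a preCombine field, written
  // in strict or upsert insert mode with bulk insert, drop-duplicates and overwrite all disabled,
  // matches the `combineBeforeInsert` case and resolves to the upsert operation; a table with no
  // primary key (and hence auto record key generation) falls through to the bulk-insert case.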

  /**
   * Build the write config for an insert into the given Hudi table.
   *
   * @return the combined write options to pass to the Hudi datasource writer.
   */
  def buildHoodieInsertConfig(hoodieCatalogTable: HoodieCatalogTable,
                              sparkSession: SparkSession,
                              isOverwritePartition: Boolean,
                              isOverwriteTable: Boolean,
                              insertPartitions: Map[String, Option[String]] = Map.empty,
                              extraOptions: Map[String, String],
                              staticOverwritePartitionPathOpt: Option[String] = Option.empty): Map[String, String] = {

    if (insertPartitions.nonEmpty &&
      (insertPartitions.keys.toSet != hoodieCatalogTable.partitionFields.toSet)) {
      throw new IllegalArgumentException(s"Insert partition fields" +
        s" [${insertPartitions.keys.mkString(" ")}]" +
        s" do not match the partition fields defined in the table [${hoodieCatalogTable.partitionFields.mkString(",")}]")
    }
    val path = hoodieCatalogTable.tableLocation
    val tableType = hoodieCatalogTable.tableTypeName
    val tableConfig = hoodieCatalogTable.tableConfig

    val combinedOpts: Map[String, String] = combineOptions(hoodieCatalogTable, tableConfig, sparkSession.sqlContext.conf,
      defaultOpts = Map.empty, overridingOpts = extraOptions)
    val hiveSyncConfig = buildHiveSyncConfig(sparkSession, hoodieCatalogTable, tableConfig, extraOptions)

    val partitionFieldsStr = hoodieCatalogTable.partitionFields.mkString(",")

    // NOTE: Here we fall back to "" to make sure that a null value is not overridden with the
    // default value ("ts")
    // TODO(HUDI-3456) clean up
    val preCombineField = combinedOpts.getOrElse(HoodieTableConfig.PRECOMBINE_FIELD.key,
      combinedOpts.getOrElse(PRECOMBINE_FIELD.key, ""))

    val hiveStylePartitioningEnable = Option(tableConfig.getHiveStylePartitioningEnable).getOrElse("true")
    val urlEncodePartitioning = Option(tableConfig.getUrlEncodePartitioning).getOrElse("false")
    val keyGeneratorClassName = Option(tableConfig.getKeyGeneratorClassName)
      .getOrElse(classOf[ComplexKeyGenerator].getCanonicalName)

    val enableBulkInsert = combinedOpts.getOrElse(DataSourceWriteOptions.SQL_ENABLE_BULK_INSERT.key,
      DataSourceWriteOptions.SQL_ENABLE_BULK_INSERT.defaultValue()).toBoolean
    val dropDuplicate = sparkSession.conf
      .getOption(INSERT_DROP_DUPS.key).getOrElse(INSERT_DROP_DUPS.defaultValue).toBoolean
    val shouldAutoKeyGen: Boolean = shouldAutoGenerateRecordKeys(combinedOpts)

    val insertMode = InsertMode.of(combinedOpts.getOrElse(DataSourceWriteOptions.SQL_INSERT_MODE.key,
      DataSourceWriteOptions.SQL_INSERT_MODE.defaultValue()))
    val insertModeSet = combinedOpts.contains(SQL_INSERT_MODE.key)
    val sparkSqlInsertIntoOperationOpt = combinedOpts.get(SPARK_SQL_INSERT_INTO_OPERATION.key())
    val sparkSqlInsertIntoOperationSet = sparkSqlInsertIntoOperationOpt.nonEmpty
    val sparkSqlInsertIntoOperation = sparkSqlInsertIntoOperationOpt.getOrElse(SPARK_SQL_INSERT_INTO_OPERATION.defaultValue())
    val insertDupPolicyOpt = combinedOpts.get(INSERT_DUP_POLICY.key())
    val insertDupPolicySet = insertDupPolicyOpt.nonEmpty
    val insertDupPolicy = combinedOpts.getOrElse(INSERT_DUP_POLICY.key(), INSERT_DUP_POLICY.defaultValue())
    val isNonStrictMode = insertMode == InsertMode.NON_STRICT
    val isPartitionedTable = hoodieCatalogTable.partitionFields.nonEmpty
    val combineBeforeInsert = hoodieCatalogTable.preCombineKey.nonEmpty && hoodieCatalogTable.primaryKeys.nonEmpty

    /*
     * The sql write operation has higher precedence than the legacy insert mode.
     * Only when the legacy insert mode is explicitly set, without setting the sql write operation,
     * are the legacy configs honored. In all other cases (i.e. when both are set, neither is set,
     * or only the sql write operation is set), the sql write operation is honored.
     */
    val useLegacyInsertModeFlow = insertModeSet && !sparkSqlInsertIntoOperationSet
    var operation = combinedOpts.getOrElse(OPERATION.key,
      if (useLegacyInsertModeFlow) {
        // NOTE: The target operation could be overridden by the user, so if it has been provided as an input
        //       we prefer that value over the auto-deduced operation. Otherwise, we deduce the target operation type.
        deduceOperation(enableBulkInsert, isOverwritePartition, isOverwriteTable, dropDuplicate,
          isNonStrictMode, isPartitionedTable, combineBeforeInsert, insertMode, shouldAutoKeyGen)
      } else {
        deduceSparkSqlInsertIntoWriteOperation(isOverwritePartition, isOverwriteTable,
          shouldAutoKeyGen, preCombineField, sparkSqlInsertIntoOperationSet, sparkSqlInsertIntoOperation)
      }
    )

    val overwriteTableOpts = if (operation.equals(BULK_INSERT_OPERATION_OPT_VAL)) {
      if (isOverwriteTable) {
        Map(HoodieInternalConfig.BULKINSERT_OVERWRITE_OPERATION_TYPE.key -> WriteOperationType.INSERT_OVERWRITE_TABLE.value())
      } else if (isOverwritePartition) {
        Map(HoodieInternalConfig.BULKINSERT_OVERWRITE_OPERATION_TYPE.key -> WriteOperationType.INSERT_OVERWRITE.value())
      } else {
        Map()
      }
    } else if (operation.equals(INSERT_OVERWRITE_TABLE_OPERATION_OPT_VAL)) {
      if (sparkSqlInsertIntoOperation.equals(BULK_INSERT_OPERATION_OPT_VAL) || enableBulkInsert) {
        operation = BULK_INSERT_OPERATION_OPT_VAL
        Map(HoodieInternalConfig.BULKINSERT_OVERWRITE_OPERATION_TYPE.key -> WriteOperationType.INSERT_OVERWRITE_TABLE.value())
      } else {
        Map()
      }
    } else if (operation.equals(INSERT_OVERWRITE_OPERATION_OPT_VAL)) {
      if (sparkSqlInsertIntoOperation.equals(BULK_INSERT_OPERATION_OPT_VAL) || enableBulkInsert) {
        operation = BULK_INSERT_OPERATION_OPT_VAL
        Map(HoodieInternalConfig.BULKINSERT_OVERWRITE_OPERATION_TYPE.key -> WriteOperationType.INSERT_OVERWRITE.value())
      } else {
        Map()
      }
    } else {
      Map()
    }

    val staticOverwritePartitionPathOptions = staticOverwritePartitionPathOpt match {
      case Some(staticOverwritePartitionPath) =>
        Map(HoodieInternalConfig.STATIC_OVERWRITE_PARTITION_PATHS.key() -> staticOverwritePartitionPath)
      case _ =>
        Map()
    }

    // Prefer the new insert dup policy over the legacy insert mode when deducing the payload class. Only when the insert mode
    // is explicitly specified, without any value for the insert dup policy, are the legacy configs honored. In all other cases
    // (i.e. when neither config is set, both are set, or only the insert dup policy is set), the insert dup policy is honored
    // and the insert mode is ignored.
    val useLegacyInsertDropDupFlow = insertModeSet && !insertDupPolicySet
    val deducedPayloadClassName = if (useLegacyInsertDropDupFlow) {
      deducePayloadClassNameLegacy(operation, tableType, insertMode)
    } else {
      if (insertDupPolicy == FAIL_INSERT_DUP_POLICY) {
        classOf[ValidateDuplicateKeyPayload].getCanonicalName
      } else {
        classOf[DefaultHoodieRecordPayload].getCanonicalName
      }
    }

    val (recordMergeMode, recordMergeStrategy) = if (deducedPayloadClassName.equals(classOf[ValidateDuplicateKeyPayload].getCanonicalName)) {
      (RecordMergeMode.CUSTOM.name(), HoodieRecordMerger.PAYLOAD_BASED_MERGE_STRATEGY_UUID)
    } else {
      (RecordMergeMode.EVENT_TIME_ORDERING.name(), HoodieRecordMerger.DEFAULT_MERGE_STRATEGY_UUID)
    }

    if (tableConfig.getPayloadClass.equals(classOf[DefaultHoodieRecordPayload].getCanonicalName) &&
        tableConfig.getRecordMergeMode.equals(RecordMergeMode.EVENT_TIME_ORDERING)) {
      tableConfig.clearValue(HoodieTableConfig.PAYLOAD_CLASS_NAME)
      tableConfig.clearValue(HoodieTableConfig.RECORD_MERGE_MODE)
      tableConfig.clearValue(HoodieTableConfig.RECORD_MERGE_STRATEGY_ID)
    }

    val defaultOpts = Map(
      DataSourceWriteOptions.PAYLOAD_CLASS_NAME.key -> deducedPayloadClassName,
      DataSourceWriteOptions.RECORD_MERGE_MODE.key -> recordMergeMode,
      DataSourceWriteOptions.RECORD_MERGE_STRATEGY_ID.key() -> recordMergeStrategy,
      // NOTE: By default, insert will attempt deduplication when a pre-combine column is specified
      //       for the table
      HoodieWriteConfig.COMBINE_BEFORE_INSERT.key -> String.valueOf(combineBeforeInsert),
      KEYGENERATOR_CLASS_NAME.key -> classOf[SqlKeyGenerator].getCanonicalName,
      SqlKeyGenerator.ORIGINAL_KEYGEN_CLASS_NAME -> keyGeneratorClassName,
      SqlKeyGenerator.PARTITION_SCHEMA -> hoodieCatalogTable.partitionSchema.toDDL,
      HoodieSyncConfig.META_SYNC_PARTITION_FIELDS.key -> partitionFieldsStr,
      HoodieSyncConfig.META_SYNC_ENABLED.key -> hiveSyncConfig.getString(HoodieSyncConfig.META_SYNC_ENABLED.key),
      HiveSyncConfigHolder.HIVE_SYNC_ENABLED.key -> hiveSyncConfig.getString(HiveSyncConfigHolder.HIVE_SYNC_ENABLED.key),
      HiveSyncConfigHolder.HIVE_SYNC_MODE.key -> hiveSyncConfig.getStringOrDefault(HiveSyncConfigHolder.HIVE_SYNC_MODE, HiveSyncMode.HMS.name()),
      HoodieSyncConfig.META_SYNC_DATABASE_NAME.key -> hiveSyncConfig.getStringOrDefault(HoodieSyncConfig.META_SYNC_DATABASE_NAME),
      HoodieSyncConfig.META_SYNC_TABLE_NAME.key -> hiveSyncConfig.getStringOrDefault(HoodieSyncConfig.META_SYNC_TABLE_NAME),
      HiveSyncConfigHolder.HIVE_SUPPORT_TIMESTAMP_TYPE.key -> hiveSyncConfig.getBoolean(HiveSyncConfigHolder.HIVE_SUPPORT_TIMESTAMP_TYPE).toString,
      HoodieSyncConfig.META_SYNC_PARTITION_EXTRACTOR_CLASS.key -> hiveSyncConfig.getStringOrDefault(HoodieSyncConfig.META_SYNC_PARTITION_EXTRACTOR_CLASS)
    )
    // To support auto record key generation, only set the record key write config explicitly when there are multiple primary keys
    val recordKeyConfigValue = if (hoodieCatalogTable.primaryKeys.length > 1) {
      hoodieCatalogTable.primaryKeys.mkString(",")
    } else {
      null
    }

    val overridingOpts = extraOptions ++ Map(
      "path" -> path,
      TABLE_TYPE.key -> tableType,
      DATABASE_NAME.key -> hoodieCatalogTable.table.database,
      TBL_NAME.key -> hoodieCatalogTable.tableName,
      OPERATION.key -> operation,
      HIVE_STYLE_PARTITIONING.key -> hiveStylePartitioningEnable,
      URL_ENCODE_PARTITIONING.key -> urlEncodePartitioning,
      RECORDKEY_FIELD.key -> recordKeyConfigValue,
      PRECOMBINE_FIELD.key -> preCombineField,
      PARTITIONPATH_FIELD.key -> getPartitionPathFieldWriteConfig(
        keyGeneratorClassName, partitionFieldsStr, hoodieCatalogTable)
    ) ++ overwriteTableOpts ++ getDropDupsConfig(useLegacyInsertModeFlow, combinedOpts) ++ staticOverwritePartitionPathOptions

    combineOptions(hoodieCatalogTable, tableConfig, sparkSession.sqlContext.conf,
      defaultOpts = defaultOpts, overridingOpts = overridingOpts)
  }

  def getDropDupsConfig(useLegacyInsertModeFlow: Boolean, incomingParams: Map[String, String]): Map[String, String] = {
    if (!useLegacyInsertModeFlow) {
      Map(DataSourceWriteOptions.INSERT_DUP_POLICY.key() -> incomingParams.getOrElse(DataSourceWriteOptions.INSERT_DUP_POLICY.key(),
        DataSourceWriteOptions.INSERT_DUP_POLICY.defaultValue()),
        if (incomingParams.contains(DataSourceWriteOptions.INSERT_DUP_POLICY.key()) &&
          incomingParams(DataSourceWriteOptions.INSERT_DUP_POLICY.key()) == DROP_INSERT_DUP_POLICY) {
          DataSourceWriteOptions.INSERT_DROP_DUPS.key() -> "true"
        } else {
          DataSourceWriteOptions.INSERT_DROP_DUPS.key() -> "false"
        }
      )
    } else {
      Map()
    }
  }
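
  // Example (illustrative): on the new (non-legacy) flow, an insert dup policy explicitly set to
  // DROP_INSERT_DUP_POLICY yields both the policy itself and INSERT_DROP_DUPS -> "true"; any other
  // (or unset) policy pins INSERT_DROP_DUPS to "false". On the legacy flow nothing is added here.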

  /**
   * Deduce the overwrite config based on writeOperation and overwriteMode config.
   * If hoodie.datasource.write.operation is insert_overwrite/insert_overwrite_table, use dynamic overwrite;
   * else if hoodie.datasource.overwrite.mode is configured, use it;
   * else use spark.sql.sources.partitionOverwriteMode.
   *
   * The returned staticOverwritePartitionPathOpt is defined only in the static insert_overwrite case.
   *
   * @return (overwriteMode, isOverWriteTable, isOverWritePartition, staticOverwritePartitionPathOpt)
   */
  def deduceOverwriteConfig(sparkSession: SparkSession,
                            catalogTable: HoodieCatalogTable,
                            partitionSpec: Map[String, Option[String]],
                            extraOptions: Map[String, String]): (SaveMode, Boolean, Boolean, Option[String]) = {
    val combinedOpts: Map[String, String] = combineOptions(catalogTable, catalogTable.tableConfig, sparkSession.sqlContext.conf,
      defaultOpts = Map.empty, overridingOpts = extraOptions)
    val operation = combinedOpts.getOrElse(OPERATION.key, null)
    val isOverwriteOperation = operation != null &&
      (operation.equals(INSERT_OVERWRITE_OPERATION_OPT_VAL) || operation.equals(INSERT_OVERWRITE_TABLE_OPERATION_OPT_VAL))
    // If hoodie.datasource.overwrite.mode is configured, respect it; otherwise fall back to spark.sql.sources.partitionOverwriteMode
    val hoodieOverwriteMode = combinedOpts.getOrElse(OVERWRITE_MODE.key,
      sparkSession.sqlContext.getConf(PARTITION_OVERWRITE_MODE.key)).toUpperCase()
    val isStaticOverwrite = !isOverwriteOperation && (hoodieOverwriteMode match {
      case "STATIC" => true
      case "DYNAMIC" => false
      case _ => throw new IllegalArgumentException("Config hoodie.datasource.overwrite.mode is illegal")
    })
    val isOverWriteTable = operation match {
      case INSERT_OVERWRITE_TABLE_OPERATION_OPT_VAL => true
      case INSERT_OVERWRITE_OPERATION_OPT_VAL => false
      case _ =>
        // There are two cases where insert_overwrite_table is needed:
        // 1. a non-partitioned table always overwrites the whole table
        // 2. static mode with no partition values specified
        catalogTable.partitionFields.isEmpty || (isStaticOverwrite && partitionSpec.isEmpty)
    }
    val overwriteMode = if (isOverWriteTable) SaveMode.Overwrite else SaveMode.Append
    val staticPartitions = if (isStaticOverwrite && !isOverWriteTable) {
      val fileIndex = HoodieFileIndex(sparkSession, catalogTable.metaClient, None, combinedOpts, FileStatusCache.getOrCreate(sparkSession))
      val partitionNameToType = catalogTable.partitionSchema.fields.map(field => (field.name, field.dataType)).toMap
      val staticPartitionValues = partitionSpec.filter(p => p._2.isDefined).mapValues(_.get)
      val predicates = staticPartitionValues.map { case (k, v) =>
        val partition = AttributeReference(k, partitionNameToType(k))()
        val value = HoodieSqlCommonUtils.castIfNeeded(Literal.create(v), partitionNameToType(k))
        EqualTo(partition, value)
      }.toSeq
      Option(fileIndex.getPartitionPaths(predicates).map(_.getPath).mkString(","))
    } else {
      Option.empty
    }
    (overwriteMode, isOverWriteTable, !isOverWriteTable, staticPartitions)
  }
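
  // Example (illustrative): `INSERT OVERWRITE t PARTITION (dt = '2024-01-01') ...` on a partitioned
  // table under static overwrite mode resolves to (SaveMode.Append, isOverWriteTable = false,
  // isOverWritePartition = true, Some(<matching partition paths>)); the same statement without a
  // PARTITION clause in static mode overwrites the whole table (SaveMode.Overwrite).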

  def buildHoodieDropPartitionsConfig(sparkSession: SparkSession,
                                      hoodieCatalogTable: HoodieCatalogTable,
                                      partitionsToDrop: String): Map[String, String] = {
    val partitionFields = hoodieCatalogTable.partitionFields.mkString(",")
    val tableConfig = hoodieCatalogTable.tableConfig

    val hiveSyncConfig = buildHiveSyncConfig(sparkSession, hoodieCatalogTable, tableConfig)

    val overridingOpts = Map(
      "path" -> hoodieCatalogTable.tableLocation,
      TBL_NAME.key -> hoodieCatalogTable.tableName,
      DATABASE_NAME.key -> hoodieCatalogTable.table.database,
      TABLE_TYPE.key -> hoodieCatalogTable.tableTypeName,
      OPERATION.key -> DataSourceWriteOptions.DELETE_PARTITION_OPERATION_OPT_VAL,
      PARTITIONS_TO_DELETE.key -> partitionsToDrop,
      RECORDKEY_FIELD.key -> hoodieCatalogTable.primaryKeys.mkString(","),
      PRECOMBINE_FIELD.key -> hoodieCatalogTable.preCombineKey.getOrElse(""),
      PARTITIONPATH_FIELD.key -> getPartitionPathFieldWriteConfig(
        tableConfig.getKeyGeneratorClassName, partitionFields, hoodieCatalogTable),
      HoodieSyncConfig.META_SYNC_ENABLED.key -> hiveSyncConfig.getString(HoodieSyncConfig.META_SYNC_ENABLED.key),
      HiveSyncConfigHolder.HIVE_SYNC_ENABLED.key -> hiveSyncConfig.getString(HiveSyncConfigHolder.HIVE_SYNC_ENABLED.key),
      HiveSyncConfigHolder.HIVE_SYNC_MODE.key -> hiveSyncConfig.getStringOrDefault(HiveSyncConfigHolder.HIVE_SYNC_MODE, HiveSyncMode.HMS.name()),
      HoodieSyncConfig.META_SYNC_DATABASE_NAME.key -> hiveSyncConfig.getStringOrDefault(HoodieSyncConfig.META_SYNC_DATABASE_NAME),
      HoodieSyncConfig.META_SYNC_TABLE_NAME.key -> hiveSyncConfig.getStringOrDefault(HoodieSyncConfig.META_SYNC_TABLE_NAME),
      HiveSyncConfigHolder.HIVE_SUPPORT_TIMESTAMP_TYPE.key -> hiveSyncConfig.getBoolean(HiveSyncConfigHolder.HIVE_SUPPORT_TIMESTAMP_TYPE).toString,
      HoodieSyncConfig.META_SYNC_PARTITION_FIELDS.key -> partitionFields,
      HoodieSyncConfig.META_SYNC_PARTITION_EXTRACTOR_CLASS.key -> hiveSyncConfig.getStringOrDefault(HoodieSyncConfig.META_SYNC_PARTITION_EXTRACTOR_CLASS)
    )

    combineOptions(hoodieCatalogTable, tableConfig, sparkSession.sqlContext.conf,
      defaultOpts = Map.empty, overridingOpts = overridingOpts)
  }

  def buildHoodieDeleteTableConfig(hoodieCatalogTable: HoodieCatalogTable,
                                   sparkSession: SparkSession): Map[String, String] = {
    val path = hoodieCatalogTable.tableLocation
    val tableConfig = hoodieCatalogTable.tableConfig
    val tableSchema = hoodieCatalogTable.tableSchema
    val partitionColumns = tableConfig.getPartitionFieldProp.split(",").map(_.toLowerCase(Locale.ROOT))
    val partitionSchema = StructType(tableSchema.filter(f => partitionColumns.contains(f.name)))

    val hiveSyncConfig = buildHiveSyncConfig(sparkSession, hoodieCatalogTable, tableConfig)

    val defaultOpts = Map(
      KEYGENERATOR_CLASS_NAME.key -> classOf[SqlKeyGenerator].getCanonicalName,
      SqlKeyGenerator.ORIGINAL_KEYGEN_CLASS_NAME -> tableConfig.getKeyGeneratorClassName,
      SqlKeyGenerator.PARTITION_SCHEMA -> partitionSchema.toDDL,
      HoodieSyncConfig.META_SYNC_ENABLED.key -> hiveSyncConfig.getString(HoodieSyncConfig.META_SYNC_ENABLED.key),
      HiveSyncConfigHolder.HIVE_SYNC_ENABLED.key -> hiveSyncConfig.getString(HiveSyncConfigHolder.HIVE_SYNC_ENABLED.key),
      HiveSyncConfigHolder.HIVE_SYNC_MODE.key -> hiveSyncConfig.getStringOrDefault(HiveSyncConfigHolder.HIVE_SYNC_MODE, HiveSyncMode.HMS.name()),
      HoodieSyncConfig.META_SYNC_DATABASE_NAME.key -> hiveSyncConfig.getStringOrDefault(HoodieSyncConfig.META_SYNC_DATABASE_NAME),
      HoodieSyncConfig.META_SYNC_TABLE_NAME.key -> hiveSyncConfig.getStringOrDefault(HoodieSyncConfig.META_SYNC_TABLE_NAME),
      HiveSyncConfigHolder.HIVE_SUPPORT_TIMESTAMP_TYPE.key -> hiveSyncConfig.getBoolean(HiveSyncConfigHolder.HIVE_SUPPORT_TIMESTAMP_TYPE).toString,
      HoodieSyncConfig.META_SYNC_PARTITION_FIELDS.key -> hoodieCatalogTable.partitionFields.mkString(","),
      HoodieSyncConfig.META_SYNC_PARTITION_EXTRACTOR_CLASS.key -> hiveSyncConfig.getStringOrDefault(HoodieSyncConfig.META_SYNC_PARTITION_EXTRACTOR_CLASS)
    )

    val overridingOpts = Map(
      "path" -> path,
      RECORDKEY_FIELD.key -> hoodieCatalogTable.primaryKeys.mkString(","),
      TBL_NAME.key -> tableConfig.getTableName,
      DATABASE_NAME.key -> hoodieCatalogTable.table.database,
      HIVE_STYLE_PARTITIONING.key -> tableConfig.getHiveStylePartitioningEnable,
      URL_ENCODE_PARTITIONING.key -> tableConfig.getUrlEncodePartitioning,
      OPERATION.key -> DataSourceWriteOptions.DELETE_OPERATION_OPT_VAL,
      PARTITIONPATH_FIELD.key -> getPartitionPathFieldWriteConfig(
        tableConfig.getKeyGeneratorClassName, tableConfig.getPartitionFieldProp, hoodieCatalogTable)
    )

    combineOptions(hoodieCatalogTable, tableConfig, sparkSession.sqlContext.conf,
      defaultOpts = defaultOpts, overridingOpts = overridingOpts)
  }

  def buildHiveSyncConfig(sparkSession: SparkSession,
                          hoodieCatalogTable: HoodieCatalogTable,
                          tableConfig: HoodieTableConfig,
                          extraOptions: Map[String, String] = Map.empty): HiveSyncConfig = {
    val combinedOpts = combineOptions(hoodieCatalogTable, tableConfig, sparkSession.sqlContext.conf,
      defaultOpts = Map.empty, overridingOpts = extraOptions)
    val props = new TypedProperties(toProperties(combinedOpts))

    // Enable hive sync by default if Spark has the Hive metastore enabled.
    val enableHive = isUsingHiveCatalog(sparkSession)
    val hiveSyncConfig: HiveSyncConfig = new HiveSyncConfig(props)
    hiveSyncConfig.setValue(HoodieSyncConfig.META_SYNC_ENABLED.key, enableHive.toString)
    hiveSyncConfig.setValue(HiveSyncConfigHolder.HIVE_SYNC_ENABLED.key, enableHive.toString)
    hiveSyncConfig.setValue(HiveSyncConfigHolder.HIVE_SYNC_MODE.key, props.getString(HiveSyncConfigHolder.HIVE_SYNC_MODE.key, HiveSyncMode.HMS.name()))
    hiveSyncConfig.setValue(HoodieSyncConfig.META_SYNC_BASE_PATH, hoodieCatalogTable.tableLocation)
    hiveSyncConfig.setValue(HoodieSyncConfig.META_SYNC_BASE_FILE_FORMAT, props.getString(HoodieSyncConfig.META_SYNC_BASE_FILE_FORMAT.key, HoodieSyncConfig.META_SYNC_BASE_FILE_FORMAT.defaultValue))
    hiveSyncConfig.setValue(HoodieSyncConfig.META_SYNC_DATABASE_NAME, hoodieCatalogTable.table.identifier.database.getOrElse("default"))
    hiveSyncConfig.setDefaultValue(HoodieSyncConfig.META_SYNC_TABLE_NAME, hoodieCatalogTable.table.identifier.table)
    if (props.get(HoodieSyncConfig.META_SYNC_PARTITION_FIELDS.key) != null) {
      hiveSyncConfig.setValue(HoodieSyncConfig.META_SYNC_PARTITION_FIELDS, props.getString(HoodieSyncConfig.META_SYNC_PARTITION_FIELDS.key))
    }
    hiveSyncConfig.setDefaultValue(HoodieSyncConfig.META_SYNC_PARTITION_EXTRACTOR_CLASS, classOf[MultiPartKeysValueExtractor].getName)
    // This defaults to true to ensure consistency, as Spark syncs TIMESTAMP types as TIMESTAMP by default
    // via Spark's externalCatalog API, which is used by AlterHoodieTableCommand.
    hiveSyncConfig.setDefaultValue(HiveSyncConfigHolder.HIVE_SUPPORT_TIMESTAMP_TYPE, "true")
    if (hiveSyncConfig.useBucketSync())
      hiveSyncConfig.setValue(HiveSyncConfigHolder.HIVE_SYNC_BUCKET_SYNC_SPEC,
        HiveSyncConfig.getBucketSpec(props.getString(HoodieIndexConfig.BUCKET_INDEX_HASH_FIELD.key),
          props.getInteger(HoodieIndexConfig.BUCKET_INDEX_NUM_BUCKETS.key)))
    if (props.containsKey(HiveExternalCatalog.CREATED_SPARK_VERSION))
      hiveSyncConfig.setValue(HoodieSyncConfig.META_SYNC_SPARK_VERSION,
        props.getString(HiveExternalCatalog.CREATED_SPARK_VERSION))
    hiveSyncConfig
  }
}
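
/*
 * Usage sketch (illustrative; `ExampleInsertIntoCommand`, `df` and the partition spec are hypothetical,
 * not part of Hudi): Spark SQL command implementations mix this trait in, deduce the overwrite
 * semantics first and then build the write options, e.g.
 *
 *   case class ExampleInsertIntoCommand(table: HoodieCatalogTable) extends ProvidesHoodieConfig {
 *     def run(spark: SparkSession, df: org.apache.spark.sql.DataFrame,
 *             partitionSpec: Map[String, Option[String]], extraOptions: Map[String, String]): Unit = {
 *       val (saveMode, isOverwriteTable, isOverwritePartition, staticPartitions) =
 *         deduceOverwriteConfig(spark, table, partitionSpec, extraOptions)
 *       val opts = buildHoodieInsertConfig(table, spark, isOverwritePartition, isOverwriteTable,
 *         partitionSpec, extraOptions, staticPartitions)
 *       df.write.format("hudi").options(opts).mode(saveMode).save()
 *     }
 *   }
 */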

object ProvidesHoodieConfig {

  private val log = LoggerFactory.getLogger(getClass)

  // NOTE: PLEASE READ CAREFULLY BEFORE CHANGING
  //
  // Configuration for Spark SQL operations may come from a variety of sources that ultimately
  // have to be combined under a clear and consistent process:
  //
  //    - Default: specifies default values preferred by the feature/component (can be
  //      overridden by any source)
  //
  //    - Overriding: specifies mandatory values required by the feature/component (can NOT be
  //      overridden by any source)
  //
  def combineOptions(catalogTable: HoodieCatalogTable,
                     tableConfig: HoodieTableConfig,
                     sqlConf: SQLConf,
                     defaultOpts: Map[String, String],
                     overridingOpts: Map[String, String] = Map.empty): Map[String, String] = {
    // NOTE: Properties are merged in the following order of priority (first has the highest priority, last has the
    //       lowest, which is inverse to the ordering in the code):
    //          1. (Extra) Option overrides
    //          2. Spark SQL configs
    //          3. Persisted Hudi's Table configs
    //          4. Table's properties in Spark Catalog
    //          5. Global DFS properties
    //          6. (Feature-specific) Default values
    filterNullValues(defaultOpts) ++
      DFSPropertiesConfiguration.getGlobalProps.asScala.toMap ++
      // NOTE: Catalog table properties provided through the `TBLPROPERTIES` clause might contain Spark SQL specific
      //       properties that need to be mapped to Hudi's conventional ones
      mapSqlOptionsToDataSourceWriteConfigs(catalogTable.catalogProperties) ++
      tableConfig.getProps.asScala.toMap ++
      filterHoodieConfigs(sqlConf.getAllConfs) ++
      filterNullValues(overridingOpts)
  }
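
  // Example (illustrative): a key supplied both by the persisted table config and via `overridingOpts`
  // resolves to the overriding value; a key present only in `defaultOpts` is used solely when no
  // higher-priority source (global DFS properties, catalog properties, table config, SQL confs, or
  // overrides) provides it.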

  /**
   * @param tableConfigKeyGeneratorClassName     key generator class name in the table config.
   * @param partitionFieldNamesWithoutKeyGenType partition field names without key generator types
   *                                             from the table config.
   * @param catalogTable                         HoodieCatalogTable instance to fetch table properties.
   * @return the write config value to set for "hoodie.datasource.write.partitionpath.field".
   */
  def getPartitionPathFieldWriteConfig(tableConfigKeyGeneratorClassName: String,
                                       partitionFieldNamesWithoutKeyGenType: String,
                                       catalogTable: HoodieCatalogTable): String = {
    if (StringUtils.isNullOrEmpty(tableConfigKeyGeneratorClassName)) {
      partitionFieldNamesWithoutKeyGenType
    } else {
      val writeConfigPartitionField = catalogTable.catalogProperties.get(PARTITIONPATH_FIELD.key())
      val keyGenClass = ReflectionUtils.getClass(tableConfigKeyGeneratorClassName)
      if (classOf[CustomKeyGenerator].equals(keyGenClass)
        || classOf[CustomAvroKeyGenerator].equals(keyGenClass)) {
        val partitionFieldWithKeyGenType = HoodieTableConfig.getPartitionFieldPropForKeyGenerator(catalogTable.tableConfig).orElse("")
        if (writeConfigPartitionField.isDefined) {
          writeConfigPartitionField.get
        } else if (StringUtils.nonEmpty(partitionFieldWithKeyGenType)) {
          partitionFieldWithKeyGenType
        } else {
          log.warn("Write config \"hoodie.datasource.write.partitionpath.field\" is not set for "
            + "custom key generator. This may fail the write operation.")
          partitionFieldNamesWithoutKeyGenType
        }
      } else {
        partitionFieldNamesWithoutKeyGenType
      }
    }
  }
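
  // Example (illustrative, assuming a table created with CustomKeyGenerator and the typed partition
  // field form, e.g. "region:simple,ts:timestamp"): the explicitly configured write value wins if
  // present, otherwise the typed form from the table config is returned so the per-field
  // key-generator type is preserved; for any other key generator the plain field list
  // (e.g. "region,ts") is passed through unchanged.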

  def isSchemaEvolutionEnabled(sparkSession: SparkSession): Boolean =
    sparkSession.sessionState.conf.getConfString(HoodieCommonConfig.SCHEMA_EVOLUTION_ENABLE.key,
      DFSPropertiesConfiguration.getGlobalProps.getString(HoodieCommonConfig.SCHEMA_EVOLUTION_ENABLE.key(),
        HoodieCommonConfig.SCHEMA_EVOLUTION_ENABLE.defaultValue.toString)
    ).toBoolean

  private def filterNullValues(opts: Map[String, String]): Map[String, String] =
    opts.filter { case (_, v) => v != null }

}