org.apache.hudi.HoodieWriterUtils.scala

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.hudi

import org.apache.hudi.AutoRecordKeyGenerationUtils.shouldAutoGenerateRecordKeys
import org.apache.hudi.DataSourceOptionsHelper.allAlternatives
import org.apache.hudi.DataSourceWriteOptions._
import org.apache.hudi.common.config.{DFSPropertiesConfiguration, HoodieCommonConfig, HoodieConfig, TypedProperties}
import org.apache.hudi.common.config.HoodieMetadataConfig.ENABLE
import org.apache.hudi.common.model.{HoodieRecord, WriteOperationType}
import org.apache.hudi.common.table.HoodieTableConfig
import org.apache.hudi.config.HoodieWriteConfig.{RECORD_MERGE_MODE, SPARK_SQL_MERGE_INTO_PREPPED_KEY}
import org.apache.hudi.exception.HoodieException
import org.apache.hudi.hive.HiveSyncConfigHolder
import org.apache.hudi.keygen.{NonpartitionedKeyGenerator, SimpleKeyGenerator}
import org.apache.hudi.keygen.constant.KeyGeneratorType
import org.apache.hudi.sync.common.HoodieSyncConfig
import org.apache.hudi.util.SparkKeyGenUtils

import org.apache.spark.sql.{Dataset, Row, SparkSession}
import org.apache.spark.sql.hudi.command.{MergeIntoKeyGenerator, SqlKeyGenerator}
import org.slf4j.LoggerFactory

import scala.collection.JavaConverters._

/**
 * Writer utilities to assist the write path in the datasource and in tests.
 */
object HoodieWriterUtils {

  private val log = LoggerFactory.getLogger(getClass)

  /**
   * Add default values for unspecified write option keys.
   */
  def parametersWithWriteDefaults(parameters: Map[String, String]): Map[String, String] = {
    val globalProps = DFSPropertiesConfiguration.getGlobalProps.asScala
    val props = TypedProperties.fromMap(parameters.asJava)
    val hoodieConfig: HoodieConfig = new HoodieConfig(props)
    hoodieConfig.setDefaultValue(OPERATION)
    hoodieConfig.setDefaultValue(TABLE_TYPE)
    hoodieConfig.setDefaultValue(PRECOMBINE_FIELD)
    hoodieConfig.setDefaultValue(KEYGENERATOR_CLASS_NAME)
    hoodieConfig.setDefaultValue(ENABLE)
    hoodieConfig.setDefaultValue(COMMIT_METADATA_KEYPREFIX)
    hoodieConfig.setDefaultValue(INSERT_DROP_DUPS)
    hoodieConfig.setDefaultValue(STREAMING_RETRY_CNT)
    hoodieConfig.setDefaultValue(STREAMING_RETRY_INTERVAL_MS)
    hoodieConfig.setDefaultValue(STREAMING_IGNORE_FAILED_BATCH)
    hoodieConfig.setDefaultValue(META_SYNC_CLIENT_TOOL_CLASS_NAME)
    hoodieConfig.setDefaultValue(HiveSyncConfigHolder.HIVE_SYNC_ENABLED)
    hoodieConfig.setDefaultValue(HoodieSyncConfig.META_SYNC_ENABLED)
    hoodieConfig.setDefaultValue(HoodieSyncConfig.META_SYNC_DATABASE_NAME)
    hoodieConfig.setDefaultValue(HoodieSyncConfig.META_SYNC_TABLE_NAME)
    hoodieConfig.setDefaultValue(HoodieSyncConfig.META_SYNC_BASE_FILE_FORMAT)
    hoodieConfig.setDefaultValue(HiveSyncConfigHolder.HIVE_USER)
    hoodieConfig.setDefaultValue(HiveSyncConfigHolder.HIVE_PASS)
    hoodieConfig.setDefaultValue(HiveSyncConfigHolder.HIVE_URL)
    hoodieConfig.setDefaultValue(HoodieSyncConfig.META_SYNC_PARTITION_FIELDS)
    hoodieConfig.setDefaultValue(HoodieSyncConfig.META_SYNC_PARTITION_EXTRACTOR_CLASS)
    hoodieConfig.setDefaultValue(HIVE_STYLE_PARTITIONING)
    hoodieConfig.setDefaultValue(HiveSyncConfigHolder.HIVE_USE_JDBC)
    hoodieConfig.setDefaultValue(HiveSyncConfigHolder.HIVE_CREATE_MANAGED_TABLE)
    hoodieConfig.setDefaultValue(HiveSyncConfigHolder.HIVE_SYNC_AS_DATA_SOURCE_TABLE)
    hoodieConfig.setDefaultValue(ASYNC_COMPACT_ENABLE)
    hoodieConfig.setDefaultValue(INLINE_CLUSTERING_ENABLE)
    hoodieConfig.setDefaultValue(ASYNC_CLUSTERING_ENABLE)
    hoodieConfig.setDefaultValue(ENABLE_ROW_WRITER)
    hoodieConfig.setDefaultValue(RECONCILE_SCHEMA)
    hoodieConfig.setDefaultValue(DROP_PARTITION_COLUMNS)
    hoodieConfig.setDefaultValue(KEYGENERATOR_CONSISTENT_LOGICAL_TIMESTAMP_ENABLED)
    Map() ++ hoodieConfig.getProps.asScala ++ globalProps ++ DataSourceOptionsHelper.translateConfigurations(parameters)
  }
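
  /*
   * Minimal usage sketch for parametersWithWriteDefaults (illustrative only; the option keys and
   * values below are hypothetical example inputs, not defaults asserted by this class):
   *
   *   val userOpts = Map(
   *     "hoodie.table.name" -> "trips",
   *     "hoodie.datasource.write.recordkey.field" -> "uuid",
   *     "hoodie.datasource.write.partitionpath.field" -> "city")
   *   // Returns userOpts merged with global props, translated alternative keys, and datasource
   *   // defaults such as operation, table type and streaming retry settings.
   *   val writeOpts = HoodieWriterUtils.parametersWithWriteDefaults(userOpts)
   */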

  /**
   * Determines whether writes need to take the prepped path or the regular non-prepped path.
   * - For spark-sql writes (UPDATEs, DELETEs), the prepped flow can be used due to the presence of meta fields.
   * - For pk-less tables, if the incoming df has meta fields, the prepped flow can be used.
   *
   * @param hoodieConfig hoodie config of interest.
   * @param parameters   raw parameters.
   * @param operation    operation type.
   * @param df           incoming dataframe.
   * @return true for prepped writes, false otherwise.
   */
  def canDoPreppedWrites(hoodieConfig: HoodieConfig, parameters: Map[String, String], operation: WriteOperationType, df: Dataset[Row]): Boolean = {
    var isPrepped = false
    if (shouldAutoGenerateRecordKeys(parameters)
      && parameters.getOrElse(SPARK_SQL_WRITES_PREPPED_KEY, "false").equals("false")
      && parameters.getOrElse(SPARK_SQL_MERGE_INTO_PREPPED_KEY, "false").equals("false")
      && df.schema.fieldNames.contains(HoodieRecord.RECORD_KEY_METADATA_FIELD)) {
      // For a pk-less table, writes through the spark-ds writer can potentially use the prepped path
      // if meta fields are present in the incoming df.
      if (operation == WriteOperationType.UPSERT) {
        log.warn("Changing operation type to UPSERT PREPPED for pk-less table upserts")
        isPrepped = true
      } else if (operation == WriteOperationType.DELETE) {
        log.warn("Changing operation type to DELETE PREPPED for pk-less table deletes")
        isPrepped = true
      }
    }
    isPrepped
  }
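
  /*
   * Sketch of how a caller might use canDoPreppedWrites (illustrative; `hoodieConfig`, `writeOpts`
   * and `df` stand for caller-supplied values, where `df` is an incoming DataFrame that may
   * already carry Hudi meta fields such as _hoodie_record_key):
   *
   *   val prepped = HoodieWriterUtils.canDoPreppedWrites(
   *     hoodieConfig, writeOpts, WriteOperationType.UPSERT, df)
   *   // A caller could then switch to the corresponding prepped operation, e.g.
   *   // WriteOperationType.UPSERT_PREPPED, when `prepped` is true.
   */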

  /**
   * Fetch params by translating alternatives if any. Do not set any default as this method is intended to be called
   * before validation.
   *
   * @param parameters hash map of parameters.
   * @return hash map of the raw parameters with any alternative keys translated.
   */
  def getParamsWithAlternatives(parameters: Map[String, String]): Map[String, String] = {
    val globalProps = DFSPropertiesConfiguration.getGlobalProps.asScala
    val props = TypedProperties.fromMap(parameters.asJava)
    val hoodieConfig: HoodieConfig = new HoodieConfig(props)
    // do not set any default as this is called before validation.
    Map() ++ hoodieConfig.getProps.asScala ++ globalProps ++ DataSourceOptionsHelper.translateConfigurations(parameters)
  }

  /**
   * Get the partition columns to be stored in hoodie.properties.
   */
  def getPartitionColumns(parameters: Map[String, String]): String = {
    SparkKeyGenUtils.getPartitionColumns(TypedProperties.fromMap(parameters.asJava))
  }

  def convertMapToHoodieConfig(parameters: Map[String, String]): HoodieConfig = {
    val properties = TypedProperties.fromMap(parameters.asJava)
    new HoodieConfig(properties)
  }

  def getOriginKeyGenerator(parameters: Map[String, String]): String = {
    val kg = parameters.getOrElse(KEYGENERATOR_CLASS_NAME.key(), null)
    if (classOf[SqlKeyGenerator].getCanonicalName == kg || classOf[MergeIntoKeyGenerator].getCanonicalName == kg) {
      parameters.getOrElse(SqlKeyGenerator.ORIGINAL_KEYGEN_CLASS_NAME, null)
    } else {
      kg
    }
  }
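
  /*
   * Illustrative example for getOriginKeyGenerator: when spark-sql has wrapped the user key
   * generator with SqlKeyGenerator, the original class name is recovered from the
   * ORIGINAL_KEYGEN_CLASS_NAME entry (the option values below are hypothetical):
   *
   *   val opts = Map(
   *     KEYGENERATOR_CLASS_NAME.key() -> classOf[SqlKeyGenerator].getCanonicalName,
   *     SqlKeyGenerator.ORIGINAL_KEYGEN_CLASS_NAME -> classOf[SimpleKeyGenerator].getCanonicalName)
   *   // Returns classOf[SimpleKeyGenerator].getCanonicalName
   *   HoodieWriterUtils.getOriginKeyGenerator(opts)
   */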

  private final val EXPRESSION_PAYLOAD_CLASS_NAME = "org.apache.spark.sql.hudi.command.payload.ExpressionPayload"
  private final val VALIDATE_DUPLICATE_KEY_PAYLOAD_CLASS_NAME = "org.apache.spark.sql.hudi.command.ValidateDuplicateKeyPayload"

  /**
   * Logic to skip validation of an input config against the table config.
   * In nearly all cases the input config should match the table config,
   * but there are a few configs that are allowed to differ.
   */
  private def shouldIgnoreConfig(key: String, value: String, params: Map[String, String]): Boolean = {
    var ignoreConfig = false
    // Base file format can change between writes, so ignore it.
    ignoreConfig = ignoreConfig || HoodieTableConfig.BASE_FILE_FORMAT.key.equals(key)

    // ExpressionPayload will never be persisted as the table config, so skip validation of the merge configs
    ignoreConfig = ignoreConfig || (params.getOrElse(PAYLOAD_CLASS_NAME.key(), "").equals(EXPRESSION_PAYLOAD_CLASS_NAME)
      && (key.equals(PAYLOAD_CLASS_NAME.key()) || key.equals(HoodieTableConfig.PAYLOAD_CLASS_NAME.key())
      || key.equals(RECORD_MERGE_MODE.key())
      || key.equals(RECORD_MERGE_STRATEGY_ID.key())))

    // Skip payload validation only in the case where INSERT INTO falls back to the legacy duplicate-key validation payload
    ignoreConfig = ignoreConfig || (key.equals(PAYLOAD_CLASS_NAME.key()) && value.equals(VALIDATE_DUPLICATE_KEY_PAYLOAD_CLASS_NAME))

    ignoreConfig
  }

  def validateTableConfig(spark: SparkSession, params: Map[String, String],
                          tableConfig: HoodieConfig): Unit = {
    validateTableConfig(spark, params, tableConfig, false)
  }

  /**
   * Detects conflicts between new parameters and existing table configurations
   */
  def validateTableConfig(spark: SparkSession, params: Map[String, String],
                          tableConfig: HoodieConfig, isOverWriteMode: Boolean): Unit = {
    // If Overwrite is set as save mode, we don't need to do table config validation.
    if (!isOverWriteMode) {
      val resolver = spark.sessionState.conf.resolver
      val diffConfigs = StringBuilder.newBuilder
      params.foreach { case (key, value) =>
        if (!shouldIgnoreConfig(key, value, params)) {
          val existingValue = getStringFromTableConfigWithAlternatives(tableConfig, key)
          if (null != existingValue && !resolver(existingValue, value)) {
            diffConfigs.append(s"$key:\t$value\t${tableConfig.getString(key)}\n")
          }
        }
      }

      if (null != tableConfig) {
        val datasourceRecordKey = params.getOrElse(RECORDKEY_FIELD.key(), null)
        val tableConfigRecordKey = tableConfig.getString(HoodieTableConfig.RECORDKEY_FIELDS)
        if ((null != datasourceRecordKey && null != tableConfigRecordKey
          && datasourceRecordKey != tableConfigRecordKey) || (null != datasourceRecordKey && datasourceRecordKey.nonEmpty
          && tableConfigRecordKey == null)) {
          // if both are non null, they should match.
          // if incoming record key is non empty, table config should also be non empty.
          diffConfigs.append(s"RecordKey:\t$datasourceRecordKey\t$tableConfigRecordKey\n")
        }

        val datasourcePreCombineKey = params.getOrElse(PRECOMBINE_FIELD.key(), null)
        val tableConfigPreCombineKey = tableConfig.getString(HoodieTableConfig.PRECOMBINE_FIELD)
        if (null != datasourcePreCombineKey && null != tableConfigPreCombineKey && datasourcePreCombineKey != tableConfigPreCombineKey) {
          diffConfigs.append(s"PreCombineKey:\t$datasourcePreCombineKey\t$tableConfigPreCombineKey\n")
        }

        val datasourceKeyGen = getOriginKeyGenerator(params)
        val tableConfigKeyGen = KeyGeneratorType.getKeyGeneratorClassName(tableConfig)
        if (null != datasourceKeyGen && null != tableConfigKeyGen
          && datasourceKeyGen != tableConfigKeyGen) {
          diffConfigs.append(s"KeyGenerator:\t$datasourceKeyGen\t$tableConfigKeyGen\n")
        }

        // Please note that the validation of partition path fields needs the key generator class
        // for the table, since the custom key generator expects a different format of
        // the value of the write config "hoodie.datasource.write.partitionpath.field"
        // e.g., "col:simple,ts:timestamp", whereas the table config "hoodie.table.partition.fields"
        // in hoodie.properties stores "col,ts".
        // The "params" here may only contain the write config of partition path field,
        // so we need to pass in the validated key generator class name.
        val validatedKeyGenClassName = if (tableConfigKeyGen != null) {
          Option(tableConfigKeyGen)
        } else if (datasourceKeyGen != null) {
          Option(datasourceKeyGen)
        } else {
          None
        }
        val datasourcePartitionFields = params.getOrElse(PARTITIONPATH_FIELD.key(), null)
        val currentPartitionFields = if (datasourcePartitionFields == null) {
          null
        } else {
          SparkKeyGenUtils.getPartitionColumns(validatedKeyGenClassName, TypedProperties.fromMap(params.asJava))
        }
        val tableConfigPartitionFields = HoodieTableConfig.getPartitionFieldProp(tableConfig).orElse(null)
        if (null != datasourcePartitionFields && null != tableConfigPartitionFields
          && currentPartitionFields != tableConfigPartitionFields) {
          diffConfigs.append(s"PartitionPath:\t$currentPartitionFields\t$tableConfigPartitionFields\n")
        }
      }

      if (diffConfigs.nonEmpty) {
        diffConfigs.insert(0, "\nConfig conflict(key\tcurrent value\texisting value):\n")
        throw new HoodieException(diffConfigs.toString.trim)
      }
    }

    // Check schema evolution for bootstrap tables.
    // Schema evolution is currently not supported for bootstrap tables.
    if (params.get(OPERATION.key).contains(BOOTSTRAP_OPERATION_OPT_VAL)
      && params.getOrElse(HoodieCommonConfig.SCHEMA_EVOLUTION_ENABLE.key(), "false").toBoolean) {
      throw new HoodieException(String
        .format("Schema evolution is currently not supported for bootstrap tables; please set %s to false", HoodieCommonConfig.SCHEMA_EVOLUTION_ENABLE.key()))
    }
  }
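
  /*
   * Illustrative conflict case for validateTableConfig (`spark`, `tableConfig` and the values are
   * hypothetical): if the incoming options name a record key that differs from the one recorded in
   * the table config, the method throws a HoodieException carrying a "Config conflict" message.
   *
   *   val incoming = Map(RECORDKEY_FIELD.key() -> "user_id")
   *   // assuming tableConfig (read from hoodie.properties) has RECORDKEY_FIELDS set to "uuid"
   *   HoodieWriterUtils.validateTableConfig(spark, incoming, tableConfig)  // throws HoodieException
   */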

  /**
   * Detects conflicts between the datasource key generator and the key generator in the existing table configuration.
   */
  def validateKeyGeneratorConfig(datasourceKeyGen: String, tableConfig: HoodieConfig): Unit = {
    val diffConfigs = StringBuilder.newBuilder

    if (null != tableConfig) {
      val tableConfigKeyGen = KeyGeneratorType.getKeyGeneratorClassName(tableConfig)
      if (null != tableConfigKeyGen && null != datasourceKeyGen) {
        val nonPartitionedTableConfig = tableConfigKeyGen.equals(classOf[NonpartitionedKeyGenerator].getCanonicalName)
        val simpleKeyDataSourceConfig = datasourceKeyGen.equals(classOf[SimpleKeyGenerator].getCanonicalName)
        if (nonPartitionedTableConfig && simpleKeyDataSourceConfig) {
          diffConfigs.append(s"KeyGenerator:\t$datasourceKeyGen\t$tableConfigKeyGen\n")
        }
      }
    }

    if (diffConfigs.nonEmpty) {
      diffConfigs.insert(0, "\nConfig conflict(key\tcurrent value\texisting value):\n")
      throw new HoodieException(diffConfigs.toString.trim)
    }
  }
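
  /*
   * Illustrative conflict case for validateKeyGeneratorConfig (`tableConfig` is hypothetical):
   * a table created with NonpartitionedKeyGenerator cannot be written to with SimpleKeyGenerator.
   *
   *   // assuming tableConfig carries NonpartitionedKeyGenerator as its key generator class
   *   HoodieWriterUtils.validateKeyGeneratorConfig(
   *     classOf[SimpleKeyGenerator].getCanonicalName, tableConfig)  // throws HoodieException
   */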

  private def getStringFromTableConfigWithAlternatives(tableConfig: HoodieConfig, key: String): String = {
    if (null == tableConfig) {
      null
    } else {
      if (allAlternatives.contains(key)) {
        tableConfig.getString(allAlternatives(key))
      } else {
        tableConfig.getString(key)
      }
    }
  }

  private val sparkDatasourceConfigsToTableConfigsMap = Map(
    TABLE_NAME -> HoodieTableConfig.NAME,
    TABLE_TYPE -> HoodieTableConfig.TYPE,
    PRECOMBINE_FIELD -> HoodieTableConfig.PRECOMBINE_FIELD,
    PARTITIONPATH_FIELD -> HoodieTableConfig.PARTITION_FIELDS,
    RECORDKEY_FIELD -> HoodieTableConfig.RECORDKEY_FIELDS,
    PAYLOAD_CLASS_NAME -> HoodieTableConfig.PAYLOAD_CLASS_NAME,
    RECORD_MERGE_STRATEGY_ID -> HoodieTableConfig.RECORD_MERGE_STRATEGY_ID,
    RECORD_MERGE_MODE -> HoodieTableConfig.RECORD_MERGE_MODE
  )

  def mappingSparkDatasourceConfigsToTableConfigs(options: Map[String, String]): Map[String, String] = {
    val includingTableConfigs = scala.collection.mutable.Map() ++ options
    sparkDatasourceConfigsToTableConfigsMap.foreach(kv => {
      if (options.contains(kv._1.key)) {
        includingTableConfigs(kv._2.key) = options(kv._1.key)
        includingTableConfigs.remove(kv._1.key)
      }
    })
    includingTableConfigs.toMap
  }
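
  /*
   * Illustrative example for mappingSparkDatasourceConfigsToTableConfigs (the literal key strings
   * are shown for readability and are assumed to match the corresponding ConfigProperty keys):
   *
   *   val in  = Map("hoodie.datasource.write.recordkey.field" -> "uuid")
   *   val out = HoodieWriterUtils.mappingSparkDatasourceConfigsToTableConfigs(in)
   *   // out maps "hoodie.table.recordkey.fields" -> "uuid"; the datasource-prefixed key is removed.
   */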
}



