All Downloads are FREE. Search and download functionalities are using the official Maven repository.

org.apache.hudi.HoodieSchemaUtils.scala Maven / Gradle / Ivy

/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied.  See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */

package org.apache.hudi

import org.apache.hudi.HoodieSparkSqlWriter.{CANONICALIZE_SCHEMA, SQL_MERGE_INTO_WRITES}
import org.apache.hudi.avro.AvroSchemaUtils.{checkSchemaCompatible, checkValidEvolution, isCompatibleProjectionOf, isSchemaCompatible}
import org.apache.hudi.avro.HoodieAvroUtils
import org.apache.hudi.avro.HoodieAvroUtils.removeMetadataFields
import org.apache.hudi.common.config.{HoodieCommonConfig, HoodieConfig, TypedProperties}
import org.apache.hudi.common.model.HoodieRecord
import org.apache.hudi.common.table.{HoodieTableMetaClient, TableSchemaResolver}
import org.apache.hudi.config.HoodieWriteConfig
import org.apache.hudi.exception.SchemaCompatibilityException
import org.apache.hudi.internal.schema.InternalSchema
import org.apache.hudi.internal.schema.convert.AvroInternalSchemaConverter
import org.apache.hudi.internal.schema.utils.AvroSchemaEvolutionUtils
import org.apache.hudi.internal.schema.utils.AvroSchemaEvolutionUtils.reconcileSchemaRequirements

import org.apache.avro.Schema
import org.apache.spark.sql.types.StructType
import org.slf4j.LoggerFactory

import scala.collection.JavaConverters._

/**
 * Util methods for Schema evolution in Hudi
 */
object HoodieSchemaUtils {
  private val log = LoggerFactory.getLogger(getClass)

  /**
   * Gets the latest [[InternalSchema]] of the table.
   *
   * The lookup is best-effort: if resolving the schema from the commit metadata throws, the
   * failure is logged and an empty result is returned instead of propagating the exception.
   *
   * @param config          instance of [[HoodieConfig]]
   * @param tableMetaClient instance of [[HoodieTableMetaClient]]
   * @return Option of [[InternalSchema]]. Will always be empty if schema on read is disabled
   */
  def getLatestTableInternalSchema(config: HoodieConfig,
                                   tableMetaClient: HoodieTableMetaClient): Option[InternalSchema] = {
    if (!config.getBooleanOrDefault(DataSourceReadOptions.SCHEMA_EVOLUTION_ENABLED)) {
      None
    } else {
      try {
        val tableSchemaResolver = new TableSchemaResolver(tableMetaClient)
        val internalSchemaOpt = tableSchemaResolver.getTableInternalSchemaFromCommitMetadata
        if (internalSchemaOpt.isPresent) Some(internalSchemaOpt.get()) else None
      } catch {
        case e: Exception =>
          // Keep the best-effort contract (fall back to "no internal schema"), but do not hide
          // the reason: a silent fallback is indistinguishable from a table w/o internal schema
          log.warn("Failed to resolve the latest internal schema from the commit metadata, proceeding without it", e)
          None
      }
    }
  }

  /**
   * Deduces writer's schema based on
   * <ul>
   *   <li>Source's schema</li>
   *   <li>Target table's schema (including Hudi's [[InternalSchema]] representation)</li>
   * </ul>
   *
   * @param sourceSchema         schema of the incoming batch
   * @param latestTableSchemaOpt latest schema of the table (w/ meta-fields), empty if there is none
   * @param internalSchemaOpt    table's [[InternalSchema]], if any
   * @param opts                 write options controlling canonicalization, reconciliation and validation
   * @return schema to be used by the writer
   */
  def deduceWriterSchema(sourceSchema: Schema,
                         latestTableSchemaOpt: Option[Schema],
                         internalSchemaOpt: Option[InternalSchema],
                         opts: Map[String, String]): Schema = {
    latestTableSchemaOpt match {
      // If table schema is empty, then we use the source schema as a writer's schema.
      case None => AvroInternalSchemaConverter.fixNullOrdering(sourceSchema)
      // Otherwise, we need to make sure we reconcile incoming and latest table schemas
      case Some(latestTableSchemaWithMetaFields) =>
        // NOTE: Meta-fields will be unconditionally injected by Hudi writing handles, for the sake of deducing proper writer schema
        //       we're stripping them to make sure we can perform proper analysis
        // add call to fix null ordering to ensure backwards compatibility
        val latestTableSchema = AvroInternalSchemaConverter.fixNullOrdering(removeMetadataFields(latestTableSchemaWithMetaFields))

        // Before validating whether schemas are compatible, we need to "canonicalize" source's schema
        // relative to the table's one, by doing a (minor) reconciliation of the nullability constraints:
        // for ex, if in incoming schema column A is designated as non-null, but it's designated as nullable
        // in the table's one we want to proceed aligning nullability constraints w/ the table's schema
        // Also, we promote types to the latest table schema if possible.
        val shouldCanonicalizeSchema = opts.getOrElse(CANONICALIZE_SCHEMA.key, CANONICALIZE_SCHEMA.defaultValue.toString).toBoolean
        val shouldReconcileSchema = opts.getOrElse(DataSourceWriteOptions.RECONCILE_SCHEMA.key(),
          DataSourceWriteOptions.RECONCILE_SCHEMA.defaultValue().toString).toBoolean
        val canonicalizedSourceSchema = if (shouldCanonicalizeSchema) {
          canonicalizeSchema(sourceSchema, latestTableSchema, opts, !shouldReconcileSchema)
        } else {
          AvroInternalSchemaConverter.fixNullOrdering(sourceSchema)
        }

        if (shouldReconcileSchema) {
          deduceWriterSchemaWithReconcile(sourceSchema, canonicalizedSourceSchema, latestTableSchema, internalSchemaOpt, opts)
        } else {
          deduceWriterSchemaWithoutReconcile(sourceSchema, canonicalizedSourceSchema, latestTableSchema, opts)
        }
    }
  }

  /**
   * Deducing with disabled reconciliation.
   * We have to validate that the source's schema is compatible w/ the table's latest schema,
   * such that we're able to read existing table's records using [[sourceSchema]].
   */
  private def deduceWriterSchemaWithoutReconcile(sourceSchema: Schema,
                                                 canonicalizedSourceSchema: Schema,
                                                 latestTableSchema: Schema,
                                                 opts: Map[String, String]): Schema = {
    // NOTE: In some cases we need to relax constraint of incoming dataset's schema to be compatible
    //       w/ the table's one and allow schemas to diverge. This is required in cases where
    //       partial updates will be performed (for ex, `MERGE INTO` Spark SQL statement) and as such
    //       only incoming dataset's projection has to match the table's schema, and not the whole one
    val mergeIntoWrites = opts.getOrElse(SQL_MERGE_INTO_WRITES.key(), SQL_MERGE_INTO_WRITES.defaultValue.toString).toBoolean
    val shouldValidateSchemasCompatibility = opts.getOrElse(HoodieWriteConfig.AVRO_SCHEMA_VALIDATE_ENABLE.key,
      HoodieWriteConfig.AVRO_SCHEMA_VALIDATE_ENABLE.defaultValue).toBoolean
    val allowAutoEvolutionColumnDrop = opts.getOrElse(HoodieWriteConfig.SCHEMA_ALLOW_AUTO_EVOLUTION_COLUMN_DROP.key,
      HoodieWriteConfig.SCHEMA_ALLOW_AUTO_EVOLUTION_COLUMN_DROP.defaultValue).toBoolean
    val setNullForMissingColumns = opts.getOrElse(DataSourceWriteOptions.SET_NULL_FOR_MISSING_COLUMNS.key(),
      DataSourceWriteOptions.SET_NULL_FOR_MISSING_COLUMNS.defaultValue).toBoolean

    if (!mergeIntoWrites && !shouldValidateSchemasCompatibility && !allowAutoEvolutionColumnDrop) {
      // Default behaviour
      val reconciledSchema = if (setNullForMissingColumns) {
        AvroSchemaEvolutionUtils.reconcileSchema(canonicalizedSourceSchema, latestTableSchema, setNullForMissingColumns)
      } else {
        canonicalizedSourceSchema
      }
      checkValidEvolution(reconciledSchema, latestTableSchema)
      reconciledSchema
    } else {
      // If it's merge into writes, we don't check for projection nor schema compatibility. Writers down the line will take care of it.
      // Or it's not merge into writes, and we don't validate schema, but we allow to drop columns automatically.
      // Or it's not merge into writes, we validate schema, and schema is compatible.
      if (shouldValidateSchemasCompatibility) {
        checkSchemaCompatible(latestTableSchema, canonicalizedSourceSchema, true,
          allowAutoEvolutionColumnDrop, java.util.Collections.emptySet())
      }
      canonicalizedSourceSchema
    }
  }

  /**
   * Deducing with enabled reconciliation.
   * Marked as Deprecated.
   */
  private def deduceWriterSchemaWithReconcile(sourceSchema: Schema,
                                              canonicalizedSourceSchema: Schema,
                                              latestTableSchema: Schema,
                                              internalSchemaOpt: Option[InternalSchema],
                                              opts: Map[String, String]): Schema = {
    internalSchemaOpt match {
      case Some(internalSchema) =>
        // Apply schema evolution, by auto-merging write schema and read schema
        val setNullForMissingColumns = opts.getOrElse(HoodieCommonConfig.SET_NULL_FOR_MISSING_COLUMNS.key(),
          HoodieCommonConfig.SET_NULL_FOR_MISSING_COLUMNS.defaultValue()).toBoolean
        val mergedInternalSchema = AvroSchemaEvolutionUtils.reconcileSchema(canonicalizedSourceSchema,
          internalSchema, setNullForMissingColumns)
        val evolvedSchema = AvroInternalSchemaConverter.convert(mergedInternalSchema, latestTableSchema.getFullName)
        // Meta-fields are stripped from the evolved schema only when the source schema does not
        // itself carry the record-key meta-field
        val shouldRemoveMetaDataFromInternalSchema = !sourceSchema.getFields().asScala
          .exists(f => f.name().equalsIgnoreCase(HoodieRecord.RECORD_KEY_METADATA_FIELD))
        if (shouldRemoveMetaDataFromInternalSchema) HoodieAvroUtils.removeMetadataFields(evolvedSchema) else evolvedSchema

      case None =>
        // In case schema reconciliation is enabled we will employ (legacy) reconciliation
        // strategy to produce target writer's schema (see definition below)
        val (reconciledSchema, isCompatible) =
          reconcileSchemasLegacy(latestTableSchema, canonicalizedSourceSchema)

        // NOTE: In some cases we need to relax constraint of incoming dataset's schema to be compatible
        //       w/ the table's one and allow schemas to diverge. This is required in cases where
        //       partial updates will be performed (for ex, `MERGE INTO` Spark SQL statement) and as such
        //       only incoming dataset's projection has to match the table's schema, and not the whole one
        val shouldValidateSchemasCompatibility = opts.getOrElse(HoodieWriteConfig.AVRO_SCHEMA_VALIDATE_ENABLE.key,
          HoodieWriteConfig.AVRO_SCHEMA_VALIDATE_ENABLE.defaultValue).toBoolean
        if (!shouldValidateSchemasCompatibility || isCompatible) {
          reconciledSchema
        } else {
          log.error(
            s"""Failed to reconcile incoming batch schema with the table's one.
               |Incoming schema ${sourceSchema.toString(true)}
               |Incoming schema (canonicalized) ${canonicalizedSourceSchema.toString(true)}
               |Table's schema ${latestTableSchema.toString(true)}
               |""".stripMargin)
          throw new SchemaCompatibilityException("Failed to reconcile incoming schema with the table's one")
        }
    }
  }

  /**
   * Java-friendly overload of [[deduceWriterSchema]]: converts Hudi's `Option`s and
   * [[TypedProperties]] into their Scala counterparts and delegates to the Scala variant.
   */
  def deduceWriterSchema(sourceSchema: Schema,
                         latestTableSchemaOpt: org.apache.hudi.common.util.Option[Schema],
                         internalSchemaOpt: org.apache.hudi.common.util.Option[InternalSchema],
                         props: TypedProperties): Schema = {
    deduceWriterSchema(sourceSchema,
      HoodieConversionUtils.toScalaOption(latestTableSchemaOpt),
      HoodieConversionUtils.toScalaOption(internalSchemaOpt),
      HoodieConversionUtils.fromProperties(props))
  }

  /**
   * Canonicalizes [[sourceSchema]] by reconciling it w/ [[latestTableSchema]] in following
   *
   * <ol>
   *  <li>Nullability: making sure that nullability of the fields in the source schema is matching
   *  that of the latest table's ones</li>
   * </ol>
   *
   * NOTE: `opts` is currently not consulted by this method.
   *
   * TODO support casing reconciliation
   */
  private def canonicalizeSchema(sourceSchema: Schema,
                                 latestTableSchema: Schema,
                                 opts: Map[String, String],
                                 shouldReorderColumns: Boolean): Schema = {
    reconcileSchemaRequirements(sourceSchema, latestTableSchema, shouldReorderColumns)
  }

  /**
   * Legacy reconciliation of the table's schema w/ the schema of the incoming batch.
   *
   * @return pair of the schema picked as the writer's one and the flag designating whether
   *         the other schema's data could be rewritten into it
   */
  private def reconcileSchemasLegacy(tableSchema: Schema, newSchema: Schema): (Schema, Boolean) = {
    // Legacy reconciliation implements following semantic
    //    - In case new-schema is a "compatible" projection of the existing table's one (projection allowing
    //      permitted type promotions), table's schema would be picked as (reconciled) writer's schema;
    //    - Otherwise, we'd fall back to picking new (batch's) schema as a writer's schema;
    //
    // Philosophically, such semantic aims at always choosing a "wider" schema, ie the one containing
    // the other one (schema A contains schema B, if schema B is a projection of A). This enables us,
    // to always "extend" the schema during schema evolution and hence never lose the data (when, for ex
    // existing column is being dropped in a new batch)
    //
    // NOTE: By default Hudi doesn't allow automatic schema evolution to drop the columns from the target
    //       table. However, when schema reconciliation is turned on, we would allow columns to be dropped
    //       in the incoming batch (as these would be reconciled in anyway)
    if (isCompatibleProjectionOf(tableSchema, newSchema)) {
      // Picking table schema as a writer schema we need to validate that we'd be able to
      // rewrite incoming batch's data (written in new schema) into it
      (tableSchema, isSchemaCompatible(newSchema, tableSchema))
    } else {
      // Picking new schema as a writer schema we need to validate that we'd be able to
      // rewrite table's data into it
      (newSchema, isSchemaCompatible(tableSchema, newSchema))
    }
  }

  /**
   * Check if the partition schema fields order matches the table schema fields order.
   *
   * @param tableSchema     The table schema
   * @param partitionFields The partition fields
   * @throws IllegalArgumentException if partition fields are specified for an empty table schema, or
   *                                  if the partition fields (in table-schema order) differ from `partitionFields`
   */
  def checkPartitionSchemaOrder(tableSchema: StructType, partitionFields: Seq[String]): Unit = {
    val tableSchemaFields = tableSchema.fields.map(_.name)
    // It is not allowed to specify partition columns when the table schema is not defined.
    // https://spark.apache.org/docs/latest/sql-error-conditions.html#specify_partition_is_not_allowed
    if (tableSchemaFields.isEmpty && partitionFields.nonEmpty) {
      throw new IllegalArgumentException("It is not allowed to specify partition columns when the table schema is not defined.")
    }
    // Filter the table schema fields to get the partition field names in order
    val tableSchemaPartitionFields = tableSchemaFields.filter(partitionFields.contains).toSeq
    if (tableSchemaPartitionFields != partitionFields) {
      throw new IllegalArgumentException(s"Partition schema fields order does not match the table schema fields order," +
        s" tableSchemaFields: $tableSchemaPartitionFields, partitionFields: $partitionFields.")
    }
  }
}




© 2015 - 2025 Weber Informatics LLC | Privacy Policy