za.co.absa.pramen.extras.avro.AvroUtils.scala Maven / Gradle / Ivy

Go to download
Show more of this group Show more artifacts with this name
Show all versions of pramen-extras_2.13 Show documentation
Batch data pipeline management tool
The newest version!
/*
 * Copyright 2022 ABSA Group Limited
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package za.co.absa.pramen.extras.avro

import org.apache.avro.{JsonProperties, Schema}
import org.apache.spark.sql.avro.SchemaConverters.toAvroType
import org.apache.spark.sql.types.DataType

import scala.collection.JavaConverters._

object AvroUtils {

  /**
    * Converts a Spark schema to an Avro schema using Spark's own type mapping.
    *
    * The conversion is delegated to Spark's `SchemaConverters.toAvroType`, with the top-level type
    * requested as non-nullable (`nullable = false`).
    *
    * @param sparkSchema Spark schema (for example `df.schema`)
    * @return Avro schema
    */
  def convertSparkToAvroSchema(sparkSchema: DataType): Schema =
    toAvroType(sparkSchema, nullable = false)

  /**
    * Fixes union type ordering and default values for nullable fields for Avro schemas generated by Spark.
    *
    * It does the following for every record field whose type is a two-branch union of the form `[T, "null"]`:
    * - Reorders the union so that null goes first
    * - Sets the default value of the field to null
    * This makes nullable fields forward and backward compatible.
    *
    * Example:
    *   Input:   {"name":"my_field","type":["string","null"]}
    *   Output:  {"name":"my_field","type":["null","string"],"default":null}
    *
    * Notes on scope:
    * - The schema is traversed recursively through unions, records, arrays and maps; all other types are
    *   returned as is.
    * - Only unions that are the direct type of a record field are reordered. A union used as an array element
    *   type or as a map value type is traversed, but its branch order is kept.
    * - Fields whose type is not a `[T, "null"]` union keep their existing default value (if any).
    * - Records, arrays, maps and unions on the path are re-created. For records only the name, doc, namespace,
    *   error flag and fields are carried over; for fields only the name, schema, doc, default and order.
    *
    * Based on the implementation in Hyperdrive:
    * https://github.com/AbsaOSS/hyperdrive/blob/79ce14670c96c954bb0fca152e4dddd7809de491/ingestor-default/src/main/scala/za/co/absa/hyperdrive/ingestor/implementation/utils/AbrisConfigUtil.scala#L125
    *
    * @param schema an Avro schema generated by Spark converters
    * @return Avro schema with nullable types fixed
    */
  def fixNullableFields(schema: Schema): Schema = {
    schema.getType match {
      case Schema.Type.UNION =>
        // Branches are fixed recursively, but the branch order is not touched at this level.
        val fixedBranches = schema.getTypes.asScala.map(branch => fixNullableFields(branch))
        Schema.createUnion(fixedBranches.asJava)
      case Schema.Type.RECORD =>
        val fixedFields = schema.getFields.asScala.map(field => fixNullableField(field))
        Schema.createRecord(schema.getName, schema.getDoc, schema.getNamespace, schema.isError, fixedFields.asJava)
      case Schema.Type.ARRAY =>
        Schema.createArray(fixNullableFields(schema.getElementType))
      case Schema.Type.MAP =>
        Schema.createMap(fixNullableFields(schema.getValueType))
      case _ =>
        schema
    }
  }

  /**
    * Builds a copy of a record field with its schema fixed recursively.
    *
    * If the field type is a `[T, "null"]` union, the union is reordered to `["null", T]` and the default
    * value becomes null. Otherwise the existing default value of the field is retained.
    *
    * @param field a field of an Avro record
    * @return a new field instance that is not yet attached to any record
    */
  private def fixNullableField(field: Schema.Field): Schema.Field = {
    // The recursive fix keeps the number and the kinds of union branches, so checking the fixed schema
    // is equivalent to checking the original field schema.
    val fixedSchema = fixNullableFields(field.schema())

    if (needsReordering(fixedSchema)) {
      val nullFirstSchema = reorderUnionTypesForNullable(fixedSchema)
      new Schema.Field(field.name(), nullFirstSchema, field.doc(), JsonProperties.NULL_VALUE, field.order())
    } else {
      new Schema.Field(field.name(), fixedSchema, field.doc(), field.defaultVal(), field.order())
    }
  }

  /**
    * Checks if the schema is a union of exactly two branches where the first one is not null
    * and the second one is null, e.g. `["string", "null"]`.
    *
    * @param schema an Avro schema of any type
    * @return true only for two-branch unions that have null in the second position
    */
  private def needsReordering(schema: Schema): Boolean = {
    schema.getType == Schema.Type.UNION && {
      schema.getTypes.asScala.toList match {
        case first :: second :: Nil =>
          first.getType != Schema.Type.NULL && second.getType == Schema.Type.NULL
        case _ =>
          false
      }
    }
  }

  /**
    * Turns a `[T, "null"]` union into `["null", T]`. Any other schema is returned unchanged.
    *
    * @param schema an Avro schema of any type
    * @return the reordered union, or the original schema if no reordering is needed
    */
  private def reorderUnionTypesForNullable(schema: Schema): Schema = {
    if (needsReordering(schema)) {
      // needsReordering guarantees there are exactly two branches and the first one is the non-null type.
      val nonNullBranch = schema.getTypes.get(0)
      Schema.createUnion(Schema.create(Schema.Type.NULL), nonNullBranch)
    } else {
      schema
    }
  }
}