za.co.absa.pramen.extras.avro.AvroUtils.scala Maven / Gradle / Ivy
Go to download
Show more of this group Show more artifacts with this name
Show all versions of pramen-extras_2.13 Show documentation
Show all versions of pramen-extras_2.13 Show documentation
Batch data pipeline management tool
The newest version!
/*
* Copyright 2022 ABSA Group Limited
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package za.co.absa.pramen.extras.avro
import org.apache.avro.{JsonProperties, Schema}
import org.apache.spark.sql.avro.SchemaConverters.toAvroType
import org.apache.spark.sql.types.DataType
import scala.collection.JavaConverters._
object AvroUtils {
  /**
    * Converts Spark Schema to Avro schema according to Spark's mapping.
    *
    * The top-level type is passed to the converter as non-nullable.
    *
    * @param sparkSchema - Spark schema (df.schema)
    * @return Avro schema
    */
  def convertSparkToAvroSchema(sparkSchema: DataType): Schema = {
    toAvroType(sparkSchema, nullable = false)
  }

  /**
    * Fixes union type ordering and default values for nullable fields for Avro schemas generated by Spark.
    *
    * It does the following for every record field, at any nesting level, whose type is a two-branch
    * union with null as the second branch:
    * - Reorders the union so that null goes first
    * - Sets the default value of the field to null
    * This makes nullable fields forward and backward compatible.
    *
    * Unions of any other shape (null already first, more than two branches, no null branch) keep their
    * branch order, and the field keeps its original default value. Unions that are array elements,
    * map values or members of another union are traversed but not reordered, since only record
    * fields carry default values.
    *
    * Example:
    * Input: {"name":"my_field","type":["string","null"]}
    * Output: {"name":"my_field","type":["null","string"],"default":null}
    *
    * Based on the implementation in Hyperdrive:
    * https://github.com/AbsaOSS/hyperdrive/blob/79ce14670c96c954bb0fca152e4dddd7809de491/ingestor-default/src/main/scala/za/co/absa/hyperdrive/ingestor/implementation/utils/AbrisConfigUtil.scala#L125
    *
    * @param schema an Avro schema generated by Spark converters
    * @return Avro schema with nullable types fixed
    */
  def fixNullableFields(schema: Schema): Schema = {
    schema.getType match {
      case Schema.Type.UNION =>
        val newUnionTypes = schema.getTypes.asScala.map(member => fixNullableFields(member))
        Schema.createUnion(newUnionTypes.asJava)
      case Schema.Type.RECORD =>
        val newFields = schema.getFields.asScala.map(field => fixNullableField(field))
        Schema.createRecord(schema.getName, schema.getDoc, schema.getNamespace, schema.isError, newFields.asJava)
      case Schema.Type.ARRAY =>
        Schema.createArray(fixNullableFields(schema.getElementType))
      case Schema.Type.MAP =>
        Schema.createMap(fixNullableFields(schema.getValueType))
      case _ =>
        // Primitive and other named types are returned as is.
        schema
    }
  }

  /**
    * Rebuilds a record field with its schema fixed recursively and, when the field is a
    * 'type-or-null' union, with null moved to the first branch and null set as the default.
    *
    * @param field a field of the record being processed
    * @return a new field with the same name, doc and order
    */
  private def fixNullableField(field: Schema.Field): Schema.Field = {
    val fixedSchema = fixNullableFields(field.schema())
    // A null default is only set when the union is going to be reordered so that null is its
    // first branch; otherwise whatever default the field already had (possibly none) is kept.
    val defaultValue = if (needsReordering(field.schema())) JsonProperties.NULL_VALUE else field.defaultVal()
    val reorderedSchema = reorderUnionTypesForNullable(fixedSchema)

    new Schema.Field(field.name(), reorderedSchema, field.doc(), defaultValue, field.order())
  }

  /**
    * Checks if the schema is a union of exactly two branches where null is the second one.
    *
    * @param schema an Avro schema to check
    * @return true if the schema has the form [some_type, null]
    */
  private def needsReordering(schema: Schema): Boolean = {
    if (schema.getType == Schema.Type.UNION) {
      val schemaTypes = schema.getTypes.asScala

      schemaTypes.size == 2 &&
        schemaTypes.head.getType != Schema.Type.NULL &&
        schemaTypes(1).getType == Schema.Type.NULL
    } else {
      false
    }
  }

  /**
    * Moves null to the first position for unions of the form [some_type, null].
    * Any other schema is returned unchanged.
    *
    * @param schema an Avro schema
    * @return the union [null, some_type], or the original schema if no reordering is needed
    */
  private def reorderUnionTypesForNullable(schema: Schema): Schema = {
    if (needsReordering(schema)) {
      val nonNullType = schema.getTypes.asScala.head

      Schema.createUnion(Schema.create(Schema.Type.NULL), nonNullType)
    } else {
      schema
    }
  }
}
© 2015 - 2025 Weber Informatics LLC | Privacy Policy