org.apache.spark.sql.confluent.avro.ConfluentAvroDataToCatalyst.scala
Spark extensions for SmartDataLakeBuilder (spark-extensions_2.12)

package org.apache.spark.sql.confluent.avro
import io.confluent.kafka.schemaregistry.avro.AvroSchema
import org.apache.avro.generic.GenericDatumReader
import org.apache.avro.io.{BinaryDecoder, DecoderFactory}
import org.apache.spark.sql.avro.AvroDeserializer
import org.apache.spark.sql.catalyst.expressions.codegen.{CodeGenerator, CodegenContext, ExprCode}
import org.apache.spark.sql.catalyst.expressions.{ExpectsInputTypes, Expression, UnaryExpression}
import org.apache.spark.sql.confluent.ConfluentClient
import org.apache.spark.sql.internal.LegacyBehaviorPolicy
import org.apache.spark.sql.types.{AbstractDataType, BinaryType, DataType}
import java.nio.ByteBuffer
import scala.collection.mutable
// copied from org.apache.spark.sql.avro.*
case class ConfluentAvroDataToCatalyst(child: Expression, subject: String, confluentHelper: ConfluentClient[AvroSchema])
  extends UnaryExpression with ExpectsInputTypes {

  override def inputTypes: Seq[AbstractDataType] = Seq(BinaryType)

  // The Avro schema is not serializable in older Avro versions, so it must not be stored in a class attribute.
  @transient private lazy val subjectSchema = confluentHelper.getLatestSchemaFromConfluent(subject)._2

  override lazy val dataType: DataType = AvroSchemaConverter.toSqlType(subjectSchema.rawSchema).dataType

  override def nullable: Boolean = true

  // Reading an Avro message requires the schema it was written with. Different messages may reference
  // different schemas, so a separate reader is kept per schema id.
  private val avroReaders = mutable.Map[Int, GenericDatumReader[Any]]()

  // A deserializer converting generic Avro records to Spark rows is likewise cached per schema id.
  private val avro2SparkDeserializers = mutable.Map[Int, AvroDeserializer]()

  // Buffer objects reused across invocations to reduce allocations.
  private var avroBinaryDecoder: BinaryDecoder = _
  private var avroGenericMsg: Any = _
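
  // Overall decode flow (see nullSafeEval): split the Confluent framing into schema id and Avro
  // payload, fetch the writer schema for that id from the registry, read the payload with a reader
  // that resolves the writer schema against the latest subject schema, then convert the resulting
  // generic record to Spark's internal row representation.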
  override def nullSafeEval(input: Any): Any = {
    val binary = input.asInstanceOf[Array[Byte]]
    val (schemaId, avroMsg) = parseConfluentMsg(binary)
    val (_, msgSchema) = confluentHelper.getSchemaFromConfluent(schemaId)
    avroBinaryDecoder = DecoderFactory.get().binaryDecoder(avroMsg, 0, avroMsg.length, avroBinaryDecoder)
    val avroReader = avroReaders.getOrElseUpdate(schemaId, new GenericDatumReader[Any](msgSchema.rawSchema, subjectSchema.rawSchema))
    avroGenericMsg = avroReader.read(avroGenericMsg, avroBinaryDecoder)
    val avro2SparkDeserializer = avro2SparkDeserializers.getOrElseUpdate(schemaId, new AvroDeserializer(subjectSchema.rawSchema, dataType, LegacyBehaviorPolicy.CORRECTED.toString, useStableIdForUnionType = false))
    avro2SparkDeserializer.deserialize(avroGenericMsg).orNull
  }

  override def prettyName: String = "from_confluent_avro"

  override protected def doGenCode(ctx: CodegenContext, ev: ExprCode): ExprCode = {
    val expr = ctx.addReferenceObj("this", this)
    defineCodeGen(ctx, ev, input =>
      s"(${CodeGenerator.boxedType(dataType)})$expr.nullSafeEval($input)")
  }
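
  // Confluent wire format parsed below:
  //   byte 0     : magic byte 0x00
  //   bytes 1-4  : schema id as a big-endian 32-bit int
  //   bytes 5... : Avro binary-encoded payload
  // Example: a message written with schema id 42 starts with the bytes 0x00 0x00 0x00 0x00 0x2A.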
  def parseConfluentMsg(msg: Array[Byte]): (Int, Array[Byte]) = {
    val msgBuffer = ByteBuffer.wrap(msg)
    val magicByte = msgBuffer.get
    require(magicByte == ConfluentAvroConnector.CONFLUENT_MAGIC_BYTE, "Magic byte not present at start of confluent message!")
    val schemaId = msgBuffer.getInt
    val avroMsg = msg.slice(msgBuffer.position, msgBuffer.limit)
    (schemaId, avroMsg)
  }

  override protected def withNewChildInternal(newChild: Expression): ConfluentAvroDataToCatalyst = copy(child = newChild)
}
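
A minimal usage sketch (not part of the source above): assuming df is a DataFrame whose binary "value" column holds Confluent-framed Kafka messages, and confluentClient is an already configured ConfluentClient[AvroSchema] instance (both names are hypothetical), the expression can be wrapped into a Column. Note that new Column(expr) is public API in Spark 3.x; adjust if your Spark version differs.

  import org.apache.spark.sql.Column
  import org.apache.spark.sql.functions.col

  // The subject "mytopic-value" is a placeholder; use your registry's subject naming.
  val decoded = df.withColumn("data",
    new Column(ConfluentAvroDataToCatalyst(col("value").expr, "mytopic-value", confluentClient)))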