package org.apache.spark.sql.confluent.json

import org.apache.spark.sql.confluent.json.JsonSchemaConverter._
import org.apache.spark.sql.types._
import org.json4s._

import scala.annotation.tailrec

/**
 * This code originates from https://github.com/zalando-incubator/spark-json-schema and is protected by its corresponding MIT license.
 * We added support for the date-time datatype and changed from the play-json library to json4s to minimize SDL dependencies.
 * There is also some support for additionalProperties, required properties and oneOf.
 *
 * Motivation for copying vs. creating a PR: the original version doesn't seem to be maintained very well. A PR for
 * timestamp and decimal datatype support has been open for nearly a year, see https://github.com/zalando-incubator/spark-json-schema/pull/49.
 * Also, the change from play-json to json4s would be hard to justify for the original library.
 */

/**
 * Schema converter for converting a schema in JSON format into a Spark schema.
 *
 * The resulting Spark schema performs almost no validity checks, so it makes sense
 * to combine it with a schema validator. When loading data with the schema, data is converted
 * to the type given in the schema. If this is not possible, the whole row will be null (!).
 * A field can be null if its type is a 2-element array, one of which is "null". The converted
 * schema doesn't check for 'enum' fields, i.e. fields which are limited to a given set of values.
 * It also doesn't check for required fields.
 * If a field is specified in the schema, then you can select it and it will
 * be null if missing. If a field is not in the schema, it cannot be selected even if
 * it is present in the dataset.
 *
 * @param inputSchema                 The JSON schema to convert
 * @param isStrictTypingEnabled       if isStrictTypingEnabled=true:
 *                                    - union types (oneOf) are merged where possible; otherwise they are simply mapped to StringType
 *                                    - additional properties are ignored; otherwise the corresponding schema object is mapped to MapType(String,String)
 * @param additionalPropertiesDefault Default value for the 'additionalProperties' field if it is missing in a schema with type='object'.
 *                                    The default is additionalPropertiesDefault=true, as this conforms to the JSON Schema specification.
 * @param definitionsPath             Path in the schema to look for definitions. Definitions are needed to resolve references,
 *                                    see also https://cswr.github.io/JsonSchema/spec/definitions_references/.
 *                                    Default is "definitions", but for the OpenAPI spec this has to be "components", see also https://swagger.io/docs/specification/components/.
 *
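 * A minimal usage sketch (illustrative only, assuming the json4s-jackson backend for parsing and that
 * JsonToSparkTypeMap maps "date-time" to a Spark timestamp type as described above):
 * {{{
 *   import org.json4s.jackson.JsonMethods.parse
 *
 *   val schemaJson = parse(
 *     """{
 *       "type": "object",
 *       "properties": {
 *         "id": {"type": "integer"},
 *         "name": {"type": ["string", "null"]},
 *         "createdAt": {"type": "string", "format": "date-time"}
 *       },
 *       "required": ["id"]
 *     }""")
 *
 *   // convert() returns a DataType; here a StructType with a non-nullable "id", a nullable "name"
 *   // and a (nullable) timestamp column "createdAt"
 *   val sparkSchema = new JsonToSparkSchemaConverter(schemaJson).convert()
 * }}}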
 */
class JsonToSparkSchemaConverter(inputSchema: JValue,
                                 isStrictTypingEnabled: Boolean = true,
                                 additionalPropertiesDefault: Boolean = true,
                                 definitionsPath: String = Definitions
                                ) {
  implicit val format: Formats = DefaultFormats

  lazy val definitions: JObject = (inputSchema \ definitionsPath).extractOpt[JObject]
    .getOrElse(JObject())

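  /** Converts the input JSON schema into the corresponding Spark DataType. */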
  def convert(): DataType = {
    val name = getJsonName(inputSchema).getOrElse(SchemaRoot)
    val schemaType = convertAnyType(inputSchema, name)
    schemaType.dataType
  }

  private def getJsonName(json: JValue): Option[String] = (json \ SchemaFieldName).extractOpt[String]

  private def getJsonId(json: JValue): Option[String] = (json \ SchemaFieldId).extractOpt[String]

  private def getJsonDescription(json: JValue): Option[String] = (json \ SchemaFieldDescription).extractOpt[String]

  private def getJsonAdditionalProperties(json: JValue): Boolean = {
    (json \ SchemaFieldAdditionalProperties).extractOpt[Boolean].getOrElse(additionalPropertiesDefault)
  }

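  /** Converts a JSON schema of type 'array' into a Spark ArrayType, deriving the element type from the 'items' field. */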
  private def convertJsonArray(obj: JObject, name: String, nullable: Boolean): SchemaType = {
    val resolvedObj = resolveRefs(obj)
    val resolvedName = getJsonId(resolvedObj).getOrElse(name)
    // parse items
    val items = resolvedObj \ SchemaFieldItems
    val arrType = items match {
      case JNothing => throw new IllegalStateException(s"No 'items'-field found in schema at $resolvedName")
      case v => convertAnyType(v, resolvedName)
    }
    SchemaType(ArrayType(arrType.dataType, arrType.nullable), nullable)
  }

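  /**
   * Converts a JSON schema of type 'object' into a Spark StructType.
   * If additional properties are allowed, the object is mapped to MapType(String,String) instead,
   * unless strict typing is enabled and 'properties' are defined.
   */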
  private def convertJsonObject(obj: JObject, name: String, nullable: Boolean): SchemaType = {
    val resolvedObj = resolveRefs(obj)
    val resolvedName = getJsonId(resolvedObj).getOrElse(name)
    // parse properties
    val properties = resolvedObj \ SchemaFieldProperties
    val required = (resolvedObj \ SchemaFieldRequired).extractOpt[JArray]
    val requiredFields = required.map(_.arr.collect { case str: JString => str.s }).getOrElse(Seq())
    val additionalProperties = getJsonAdditionalProperties(resolvedObj)
    val structType = properties match {
      case _: JObject if additionalProperties && !isStrictTypingEnabled => MapType(StringType, StringType)
      case v: JObject => convertJsonProperties(v, requiredFields)
      case JNothing if additionalProperties => MapType(StringType, StringType)
      case JNothing => throw new IllegalStateException(s"No 'properties'-field found in schema at $resolvedName")
      case x => throw new IllegalStateException(s"Converting properties for $resolvedName but properties type is $x instead of object")
    }
    SchemaType(structType, nullable)
  }

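  /**
   * Converts the 'properties' of an object schema into StructFields.
   * A field is nullable if its type is nullable or if it is not listed in 'required'; a 'description' is attached as field comment.
   */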
  private def convertJsonProperties(properties: JObject, required: Seq[String]): StructType = {
    val fields = properties.obj.map { case (k, v) =>
      val fieldType = convertAnyType(v, k)
      val field = StructField(getJsonName(v).getOrElse(k), fieldType.dataType, nullable = fieldType.nullable || !required.contains(k))
      // add description as metadata if defined
      val description = getJsonDescription(v)
      description.map(field.withComment).getOrElse(field)
    }
    StructType(fields)
  }

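  /**
   * Merges two object schemas (e.g. from a oneOf union) by merging their 'properties',
   * intersecting their 'required' fields and or-ing 'additionalProperties'.
   */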
  private def mergeObjectTypes(obj1: JObject, obj2: JObject, name: String): JObject = {
    val type1 = extractType(obj1, name)
    val type2 = extractType(obj2, name)
    assert(type1.json == JString("object"), "type1 must be object")
    assert(type2.json == JString("object"), "type2 must be object")
    val mergedType = if (type1.nullable || type2.nullable) JArray(List(JString("object"), JString("null"))) else JString("object")
    JObject(List(
      JField(SchemaFieldType, mergedType),
      JField(SchemaFieldProperties, (obj1 \ SchemaFieldProperties).merge(obj2 \ SchemaFieldProperties)),
      JField(SchemaFieldRequired, JArray(List((obj1 \ SchemaFieldRequired).extractOpt[Seq[String]], (obj2 \ SchemaFieldRequired).extractOpt[Seq[String]]).flatten.reduceLeftOption(_ intersect _).getOrElse(Seq()).map(JString).toList)),
      JField(SchemaFieldAdditionalProperties, JBool(Seq((obj1 \ SchemaFieldAdditionalProperties).extractOpt[Boolean], (obj2 \ SchemaFieldAdditionalProperties).extractOpt[Boolean], Some(false)).flatten.reduceLeft(_ || _))),
    ))
  }

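  /** Merges two array schemas (e.g. from a oneOf union) by merging their 'items' types. */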
  private def mergeArrayTypes(obj1: JObject, obj2: JObject, name: String): JObject = {
    val type1 = extractType(obj1, name)
    val type2 = extractType(obj2, name)
    assert(type1.json == JString("array"), "type1 must be array")
    assert(type2.json == JString("array"), "type2 must be array")
    val items1 = (obj1 \ SchemaFieldItems) match {
      case JArray(arr) => arr
      case x => Seq(x)
    }
    val items2 = (obj2 \ SchemaFieldItems) match {
      case JArray(arr) => arr
      case x => Seq(x)
    }
    val mergedItems = (items1 ++ items2).reduceLeft(mergeTypes(name))
    val mergedType = if (type1.nullable || type2.nullable) JArray(List(JString("array"), JString("null"))) else JString("array")
    JObject(List(
      JField(SchemaFieldType, mergedType),
      JField(SchemaFieldItems, mergedItems),
    ))
  }

  private def resolveRefsIfObj(json: JValue) = {
    json match {
      case jsonObj: JObject => resolveRefs(jsonObj)
      case _ => json
    }
  }

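  /**
   * Merges two JSON types into one. Object and array types are merged recursively; scalar types are widened where possible.
   * Incompatible combinations throw an exception if strict typing is enabled, otherwise they fall back to 'string'.
   */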
  private def mergeTypes(name: String)(jsonType1: JValue, jsonType2: JValue): JValue = {
    val resolvedType1 = resolveRefsIfObj(jsonType1)
    val resolvedType2 = resolveRefsIfObj(jsonType2)
    val type1 = extractType(resolvedType1, name).json
    val type2 = extractType(resolvedType2, name).json
    Seq(type1, type2).sortBy(_.toString) match {
      case Seq(JString("object"), JString("object")) =>
        mergeObjectTypes(resolvedType1.asInstanceOf[JObject], resolvedType2.asInstanceOf[JObject], name)
      case Seq(JString("object"), _) | Seq(_, JString("object")) =>
        if (isStrictTypingEnabled) throw new IllegalArgumentException(s"Cannot unify types <$resolvedType1> and <$resolvedType2> in schema at <$name>")
        else JString("string")
      case Seq(JString("array"), JString("array")) =>
        mergeArrayTypes(resolvedType1.asInstanceOf[JObject], resolvedType2.asInstanceOf[JObject], name)
      case Seq(JString("array"), _) | Seq(_, JString("array")) =>
        if (isStrictTypingEnabled) throw new IllegalArgumentException(s"Cannot unify types <$resolvedType1> and <$resolvedType2> in schema at <$name>")
        else JString("string")
      case Seq(JString("string"), _) | Seq(_, JString("string")) => JString("string")
      case Seq(JString("float"), JString("number")) => JString("number")
      case Seq(JString("integer"), JString("number")) => JString("number")
      case Seq(JString("integer"), JString("float")) => JString("number")
      case _ =>
        if (isStrictTypingEnabled) throw new IllegalArgumentException(s"Cannot unify types <$resolvedType1> and <$resolvedType2> in schema at <$name>")
        else JString("string")
    }
  }

  private case class NullableType(json: JValue, nullable: Boolean)

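  /**
   * Extracts the effective type from a JSON type array (e.g. ["string","null"]): a "null" entry marks the type as nullable,
   * and multiple remaining alternatives are merged with mergeTypes.
   */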
  private def extractArrayType(array: Seq[JValue], name: String): NullableType = {
    val nullable = array.contains(JString("null"))
    array.size match {
      case 1 if nullable =>
        throw new IllegalArgumentException(s"Null type only is not supported at <$name>")
      case 1 =>
        NullableType(array.head, nullable = false)
      case 2 if nullable =>
        array.find(_ != JString("null"))
          .map(jsonType => NullableType(jsonType, nullable = true))
          .getOrElse(throw new IllegalArgumentException(s"Incorrect definition of a nullable parameter at <$name>"))
      case _ =>
        NullableType(array.filter(_ != JString("null")).distinct.reduceLeft(mergeTypes(name)), nullable)
    }
  }

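  /**
   * Extracts the (possibly nullable) JSON type of a schema node, resolving '$ref' references
   * and giving the 'format' field (e.g. date-time) precedence over 'type'; 'oneOf' is used as fallback if 'type' is missing.
   */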
  @tailrec
  private def extractType(json: JValue, name: String): NullableType = json match {
    case str: JString => NullableType(str, nullable = false)
    case JObject(entries) if entries.isEmpty && isStrictTypingEnabled =>
      //TODO: map to variant type in Spark 4.0
      NullableType(JString("string"), nullable = true)
    case jsonObj: JObject =>
      val resolvedJsonObj = resolveRefs(jsonObj)
      // prepare SchemaFieldFormat (Airbyte extension)
      val temporalTypeWithoutTimeZone = (resolvedJsonObj \ SchemaFieldAirbyteType).toOption.collect { case x: JString => x }
        .exists(_.s.endsWith("without_timezone"))
      val jsonFormat = (resolvedJsonObj \ SchemaFieldFormat).toOption.collect { case x: JString => x }
        .map(str => JString(str.s + (if (temporalTypeWithoutTimeZone) "-ntz" else "")))
      // prepare jsonType: 'format' has higher priority than 'type'.
      val jsonType = jsonFormat
        .orElse((resolvedJsonObj \ SchemaFieldType).toOption)
        .orElse((resolvedJsonObj \ SchemaFieldOneOf).toOption)
      val extractedType = jsonType.getOrElse(throw new IllegalArgumentException(s"No 'type'-field in schema at <$name>"))
      extractType(extractedType, name)
    case JArray(arr) => extractArrayType(arr, name)
    case JNothing =>
      throw new IllegalArgumentException(s"No 'type'-field in schema at <$name>")
    case t =>
      throw new IllegalArgumentException(s"Unsupported type <${t.toString}> in schema at <$name>")
  }

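  /** Converts a schema object whose JSON type has already been extracted into the corresponding Spark SchemaType. */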
  def convertObjWithType(jsonObj: JObject, jsonType: JValue, name: String, nullable: Boolean): SchemaType = {
    jsonType match {
      case JString("object") => convertJsonObject(jsonObj, name, nullable)
      case JString("array") => convertJsonArray(jsonObj, name, nullable)
      case JString("null") => throw new IllegalArgumentException(s"type of object is 'null' at <$name>")
      case JString(str) => SchemaType(JsonToSparkTypeMap.getOrElse(str.trim.toLowerCase, StringType), nullable)
      case x => throw new IllegalArgumentException(s"Unsupported type <$x> in schema at <$name>")
    }
  }

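  /**
   * Converts an arbitrary JSON schema node (plain type string, type array, oneOf union or schema object) into a Spark SchemaType.
   */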
  def convertAnyType(json: JValue, name: String, nullable: Boolean = false): SchemaType = {
    val resolvedName = getJsonId(json).getOrElse(name)
    val jsonType = extractType(json, resolvedName)
    json match {
      case JString(str) => SchemaType(JsonToSparkTypeMap.getOrElse(str.trim.toLowerCase, StringType), jsonType.nullable || nullable)
      case JObject(entries) if entries.isEmpty && isStrictTypingEnabled =>
        //TODO: map to variant type in Spark 4.0
        SchemaType(StringType, nullable = true)
      case _: JArray => convertAnyType(jsonType.json, name, jsonType.nullable || nullable)
      case _ if jsonType.json.isInstanceOf[JObject] => convertAnyType(jsonType.json, name, jsonType.nullable || nullable)
      case jsonObj: JObject =>
        val resolvedJsonObj = resolveRefs(jsonObj)
        convertObjWithType(resolvedJsonObj, jsonType.json, resolvedName, jsonType.nullable || nullable)
      case JNothing =>
        throw new IllegalArgumentException(s"No 'type'-field in schema at <$resolvedName>")
      case t =>
        throw new IllegalArgumentException(s"Unsupported type <${t.toString}> in schema at <$resolvedName>")
    }
  }

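  /**
   * Resolves a '$ref' in the given schema object against the definitions found under definitionsPath.
   * Returns the input unchanged if it contains no reference.
   */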
  private def resolveRefs(inputJson: JObject): JObject = {
    val schemaRef = (inputJson \ Reference).extractOpt[String]
    schemaRef match {
      case Some(loc) =>
        val searchDefinitions = definitionsPath + "/"
        val defIndex = loc.indexOf(searchDefinitions) match {
          case -1 => throw new NoSuchElementException(
            s"Field with name [$Reference] requires a path containing the element '$searchDefinitions'"
          )
          case i: Int => i + searchDefinitions.length
        }
        val pathNodes = loc.drop(defIndex).split("/").toList
        val definition = pathNodes.foldLeft(definitions: JValue) { case (obj, node) => obj \ node } match {
          case obj: JObject => obj
          case JNothing => throw new NoSuchElementException(s"Path [$loc] not found in $definitionsPath")
          case x => throw new NoSuchElementException(s"Path [$loc] in $definitionsPath is of type ${x.getClass.getSimpleName} instead of JObject")
        }
        definition
      case None => inputJson
    }
  }
}