com.ebiznext.comet.job.ingest.XmlIngestionJob.scala Maven / Gradle / Ivy

Show more of this group Show more artifacts with this name
Show all versions of comet-spark3_2.12 Show documentation
comet-spark3
There is a newer version: 0.2.6
/*
 *
 *  * Licensed to the Apache Software Foundation (ASF) under one or more
 *  * contributor license agreements.  See the NOTICE file distributed with
 *  * this work for additional information regarding copyright ownership.
 *  * The ASF licenses this file to You under the Apache License, Version 2.0
 *  * (the "License"); you may not use this file except in compliance with
 *  * the License.  You may obtain a copy of the License at
 *  *
 *  *    http://www.apache.org/licenses/LICENSE-2.0
 *  *
 *  * Unless required by applicable law or agreed to in writing, software
 *  * distributed under the License is distributed on an "AS IS" BASIS,
 *  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 *  * See the License for the specific language governing permissions and
 *  * limitations under the License.
 *
 *
 */

package com.ebiznext.comet.job.ingest

import com.ebiznext.comet.config.Settings
import com.ebiznext.comet.schema.handlers.{SchemaHandler, StorageHandler}
import com.ebiznext.comet.schema.model._
import org.apache.hadoop.fs.Path
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.execution.datasources.json.JsonIngestionUtil.compareTypes
import org.apache.spark.sql.types.StructType
import org.apache.spark.sql.{DataFrame, Row}

import scala.util.{Failure, Success, Try}

/** Ingestion job for XML input files.
  *
  * Each input path is read with the `com.databricks.spark.xml` data source, using the `rowTag`
  * entry of the XML metadata options to delimit rows. The schema found in the loaded data is then
  * compared with the Spark type derived from the declared schema.
  *
  * @param domain         : Input Dataset Domain
  * @param schema         : Input Dataset Schema
  * @param types          : List of globally defined types
  * @param path           : Input dataset paths, each one is loaded separately and the results are unioned
  * @param storageHandler : Storage Handler
  * @param schemaHandler  : Schema Handler, used to compute the Spark type of the declared schema
  */
class XmlIngestionJob(
  val domain: Domain,
  val schema: Schema,
  val types: List[Type],
  val path: List[Path],
  val storageHandler: StorageHandler,
  val schemaHandler: SchemaHandler
)(implicit val settings: Settings)
    extends IngestionJob {

  /** Load every input path as XML and union the results into a single DataFrame.
    *
    * @return Success with the unioned DataFrame, or a Failure when the `rowTag` option is missing,
    *         when no input path was provided, or when reading raises a non-fatal error
    */
  def loadDataSet(): Try[DataFrame] =
    Try {
      val loaded: Try[DataFrame] = metadata.xml.flatMap(_.get("rowTag")) match {
        case None =>
          // Build the failure as a value instead of throwing inside the Failure constructor.
          Failure(new Exception(s"rowTag not found for schema ${domain.name}.${schema.name}"))
        case Some(rowTag) =>
          val perPathFrames = path.map { singlePath =>
            session.read
              .format("com.databricks.spark.xml")
              .option("rowTag", rowTag)
              .option("inferSchema", value = false)
              .option("encoding", metadata.getEncoding())
              .load(singlePath.toString)
          }
          // reduceOption rather than reduce: an empty path list becomes an explicit failure
          // instead of the opaque "empty.reduceLeft" error.
          perPathFrames.reduceOption(_.union(_)) match {
            case Some(df) =>
              df.printSchema()
              Success(df)
            case None =>
              Failure(
                new IllegalArgumentException(
                  s"No input path provided for schema ${domain.name}.${schema.name}"
                )
              )
          }
      }
      loaded
    }.flatten // the outer Try turns any non-fatal error raised while reading into a Failure

  /** Spark type derived from the declared schema, computed once on first access. */
  lazy val schemaSparkType: StructType = schema.sparkType(schemaHandler)

  /** Compare the dataset schema with the declared schema, then save the rejected entries and the
    * dataset.
    *
    * The whole dataset is handed to `saveAccepted` even when the schema comparison reports errors;
    * the errors are only written through `saveRejected`.
    *
    * @param dataset input dataset, as returned by [[loadDataSet]]
    * @return a pair made of the RDD of schema comparison errors and the RDD of the input dataset
    */
  def ingest(dataset: DataFrame): (RDD[_], RDD[_]) = {
    dataset.printSchema()
    val errorList = compareTypes(schemaSparkType, dataset.schema)
    val rejectedRDD = session.sparkContext.parallelize(errorList)
    saveRejected(rejectedRDD)
    // The result of saveAccepted is not needed here; binding it to (df, path) shadowed the
    // constructor field `path` for no benefit.
    saveAccepted(dataset) // prefer to let Spark compute the final schema
    (rejectedRDD, dataset.rdd)
  }

  // NOTE(review): "JsonJob" looks like a leftover from the JSON ingestion job. It is kept
  // unchanged because callers may rely on the value; confirm whether it should be XML-specific.
  override def name: String = "JsonJob"
}

object XmlIngestionJob {

  /** Check every row's schema against the expected Spark type.
    *
    * @param inputRDD        rows to check
    * @param schemaSparkType Spark type the schema of each row is compared with
    * @return for each input row, `Right(row)` when the comparison reports no error, otherwise
    *         `Left` holding the reported errors
    */
  def parseRDD(
    inputRDD: RDD[Row],
    schemaSparkType: StructType
  ): RDD[Either[List[String], Row]] =
    inputRDD.map { currentRow =>
      val mismatches = compareTypes(schemaSparkType, currentRow.schema)
      Either.cond(mismatches.isEmpty, currentRow, mismatches)
    }
}