
com.ebiznext.comet.job.transform.AutoTaskJob.scala

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package com.ebiznext.comet.job.transform

import java.io.{File, PrintStream}
import java.time.LocalDateTime

import com.ebiznext.comet.config.{Settings, StorageArea, UdfRegistration}
import com.ebiznext.comet.job.index.bqload.{
  BigQueryJobResult,
  BigQueryLoadConfig,
  BigQueryNativeJob
}
import com.ebiznext.comet.schema.handlers.StorageHandler
import com.ebiznext.comet.schema.model.SinkType.{BQ, FS, JDBC}
import com.ebiznext.comet.schema.model.{AutoTaskDesc, BigQuerySink, Engine, SinkType}
import com.ebiznext.comet.utils.Formatter._
import com.ebiznext.comet.utils.{JobResult, SparkJob, SparkJobResult, Utils}
import org.apache.hadoop.fs.Path

import scala.util.{Failure, Success, Try}

/** Execute the SQL task and store the result as parquet/orc/... files. If Hive support is
  * enabled, also register the result as a Hive table. If analyze support is active, also compute
  * basic statistics for the dataset.
  *
  * @param name           : Job name as defined in the YML job description file
  * @param defaultArea    : Where the resulting dataset is stored by default if not specified in the task
  * @param format         : Output file format, falling back to the configured default write format
  * @param coalesce       : Whether to coalesce the resulting dataset into a single file
  * @param udf            : UDFs to register before running the SQL statements
  * @param views          : Named views to create before running the main SQL statement
  * @param engine         : Execution engine, either SPARK or BQ
  * @param task           : Task to run
  * @param storageHandler : Storage handler used to access the underlying filesystem
  * @param sqlParameters  : SQL parameters to pass to SQL statements
  */
class AutoTaskJob(
  override val name: String,
  defaultArea: Option[StorageArea],
  format: Option[String],
  coalesce: Boolean,
  udf: Option[String],
  views: Option[Map[String, String]],
  engine: Engine,
  task: AutoTaskDesc,
  storageHandler: StorageHandler,
  sqlParameters: Map[String, String]
)(implicit val settings: Settings)
    extends SparkJob {

  override def run(): Try[JobResult] = {
    engine match {
      case Engine.BQ =>
        runBQ()
      case Engine.SPARK =>
        runSpark()
      case _ =>
        throw new Exception(s"Unsupported engine: $engine")
    }
  }

  val (createDisposition, writeDisposition) =
    Utils.getDBDisposition(task.write, hasMergeKeyDefined = false)

  private def createConfig(): BigQueryLoadConfig = {
    val bqSink = task.sink.map(sink => sink.asInstanceOf[BigQuerySink]).getOrElse(BigQuerySink())

    BigQueryLoadConfig(
      outputTable = task.dataset,
      outputDataset = task.domain,
      createDisposition = createDisposition,
      writeDisposition = writeDisposition,
      location = bqSink.location,
      outputPartition = bqSink.timestamp,
      outputClustering = bqSink.clustering.getOrElse(Nil),
      days = bqSink.days,
      requirePartitionFilter = bqSink.requirePartitionFilter.getOrElse(false),
      rls = task.rls,
      engine = Engine.BQ
    )
  }

  def runView(viewName: String, viewDir: Option[String], viewCount: Int): Try[JobResult] = {
    Try {
      val config = createConfig()
      val queryExpr = views
        .getOrElse(Map.empty)
        .getOrElse(viewName, throw new Exception(s"View with name $viewName not found"))
      val bqNativeJob = new BigQueryNativeJob(
        config,
        "DUMMY - NOT EXECUTED",
        udf
      )

      val jsonQuery =
        s"SELECT TO_JSON_STRING(t,false) FROM (${queryExpr.richFormat(sqlParameters)}) AS t"
      val result = bqNativeJob.runSQL(jsonQuery.richFormat(sqlParameters))
      import scala.collection.JavaConverters._
      result.tableResult.foreach { tableResult =>
        var count = 0
        val it = tableResult.iterateAll().iterator().asScala
        val file = viewDir
          .map { dir =>
            new File(dir).mkdirs()
            new PrintStream(new File(dir, s"$viewName.json"), "UTF-8")
          }
          .getOrElse(System.out)
        while (it.hasNext && count < viewCount) {
          val item = it.next().get(0).getStringValue
          file.println(item)
          count = count + 1
        }
        // Only close the stream when we opened a file ourselves; never close System.out.
        if (viewDir.isDefined) file.close()
      }
      result
    }
  }

  def runBQ(): Try[JobResult] = {
    val subSelects: String = views.getOrElse(Map.empty).map { case (queryName, queryExpr) =>
      queryName + " AS (" + queryExpr.richFormat(sqlParameters) + ")"
    } mkString ("WITH ", ",", " ")

    val config = createConfig()

    val bqNativeJob = new BigQueryNativeJob(
      config,
      task.sql.richFormat(sqlParameters + ("views" -> subSelects)),
      udf
    )

    val presqlResult: Try[Iterable[BigQueryJobResult]] = Try {
      task.presql.getOrElse(Nil).map { sql =>
        bqNativeJob.runSQL(sql.richFormat(sqlParameters))
      }
    }
    Utils.logFailure(presqlResult, logger)

    val jobResult: Try[JobResult] =
      bqNativeJob.run()
    Utils.logFailure(jobResult, logger)

    // We execute the post statements even if the main statement failed
    // We may be doing some cleanup here.
    val postsqlResult: Try[Iterable[BigQueryJobResult]] = Try {
      task.postsql.getOrElse(Nil).map { sql =>
        bqNativeJob.runSQL(sql.richFormat(sqlParameters))
      }
    }
    Utils.logFailure(postsqlResult, logger)

    val errors =
      Iterable(presqlResult, jobResult, postsqlResult).filter(_.isFailure).map(_.failed).map(_.get)
    errors match {
      case Nil =>
        Success(BigQueryJobResult(None))
      case _ =>
        Failure(errors.reduce(_.initCause(_)))
    }
  }

  def runSpark(): Try[SparkJobResult] = {
    udf.foreach { udf =>
      val udfInstance: UdfRegistration =
        Class
          .forName(udf)
          .getDeclaredConstructor()
          .newInstance()
          .asInstanceOf[UdfRegistration]
      udfInstance.register(session)
    }
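    // Each view value may reference its source with a "<sinkType>:" prefix:
    //   FS:<path>                     -> parquet files on the filesystem (relative paths are
    //                                    resolved against settings.comet.datasets)
    //   JDBC:<connectionName>:<query> -> a query executed against a configured JDBC connection
    //   BQ:<table>                    -> a BigQuery table, optionally suffixed with
    //                                    .comet_select(...) and/or .comet_filter(...)
    // A value without any prefix is treated as a parquet path on the filesystem.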
    views.getOrElse(Map()).foreach { case (key, value) =>
      val sepIndex = value.indexOf(":")
      val (format, configName, path) =
        if (sepIndex > 0) {
          val sinkTypeStr = value.substring(0, sepIndex)
          val sepConfigIndex = value.indexOf(':', sepIndex + 1)
          if (sepConfigIndex > 0) {
            (
              SinkType.fromString(sinkTypeStr),
              Some(value.substring(sepIndex + 1, sepConfigIndex)),
              value.substring(sepConfigIndex + 1)
            )
          } else
            (SinkType.fromString(sinkTypeStr), None, value.substring(sepIndex + 1))
        } else // parquet on the filesystem is the default
          (SinkType.FS, None, value)
      logger.info(s"Loading view $path from $format")
      val df = format match {
        case FS =>
          val fullPath =
            if (path.startsWith("/")) path else s"${settings.comet.datasets}/$path"
          session.read.parquet(fullPath)
        case JDBC =>
          val jdbcConfig =
            settings.comet.connections(
              configName.getOrElse(
                throw new Exception(s"Connection name is required to load JDBC view $key")
              )
            )
          jdbcConfig.options
            .foldLeft(session.read)((w, kv) => w.option(kv._1, kv._2))
            .format(jdbcConfig.format)
            .option("query", path.richFormat(sqlParameters))
            .load()
            .cache()
        case BQ =>
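          // The table reference may carry optional suffixes handled by the patterns below:
          //   <table>.comet_select(col1, col2)            -> keep only the listed columns
          //   <table>.comet_filter(<predicate>)           -> push the filter down to BigQuery
          //   <table>.comet_select(...).comet_filter(...) -> apply both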
          val TablePathWithFilter = "(.*)\\.comet_filter\\((.*)\\)".r
          val TablePathWithSelect = "(.*)\\.comet_select\\((.*)\\)".r
          val TablePathWithFilterAndSelect =
            "(.*)\\.comet_select\\((.*)\\)\\.comet_filter\\((.*)\\)".r
          path match {
            case TablePathWithFilterAndSelect(tablePath, select, filter) =>
              val filterFormat = filter.richFormat(sqlParameters)
              logger
                .info(s"We are loading the Table with columns: $select and filters: $filterFormat")
              session.read
                .option("readDataFormat", "AVRO")
                .format("com.google.cloud.spark.bigquery")
                .option("table", tablePath)
                .option("filter", filterFormat)
                .load()
                .selectExpr(select.replaceAll("\\s", "").split(","): _*)
                .cache()
            case TablePathWithFilter(tablePath, filter) =>
              val filterFormat = filter.richFormat(sqlParameters)
              logger.info(s"We are loading the Table with filters: $filterFormat")
              session.read
                .option("readDataFormat", "AVRO")
                .format("com.google.cloud.spark.bigquery")
                .option("table", tablePath)
                .option("filter", filterFormat)
                .load()
                .cache()
            case TablePathWithSelect(tablePath, select) =>
              logger.info(s"We are loading the Table with columns: $select")
              session.read
                .option("readDataFormat", "AVRO")
                .format("com.google.cloud.spark.bigquery")
                .option("table", tablePath)
                .load()
                .selectExpr(select.replaceAll("\\s", "").split(","): _*)
                .cache()
            case _ =>
              session.read
                .option("readDataFormat", "AVRO")
                .format("com.google.cloud.spark.bigquery")
                .option("table", path)
                .load()
                .cache()
          }
        case _ =>
          throw new Exception(s"Unsupported sink type $format for view $key")
      }
      df.createOrReplaceTempView(key)
      logger.info(s"Created view $key")
    }

    task.presql
      .getOrElse(Nil)
      .foreach(req => session.sql(req.richFormat(sqlParameters)))
    val sqlWithParameters = task.sql.richFormat(sqlParameters)
    logger.info(s"running sql request $sqlWithParameters")
    val dataframe = session.sql(sqlWithParameters)

    val targetPath = task.getTargetPath(defaultArea)
    logger.info(s"About to write resulting dataset to $targetPath")
    // The target path exists only if a storage area has been defined at task or job level
    targetPath.map { targetPath =>
      val partitionedDF =
        partitionedDatasetWriter(
          if (coalesce) dataframe.coalesce(1) else dataframe,
          task.getPartitions()
        )

      val finalDataset = partitionedDF
        .mode(task.write.toSaveMode)
        .format(format.getOrElse(settings.comet.defaultWriteFormat))
        .option("path", targetPath.toString)

      if (settings.comet.hive) {
        val tableName = task.dataset
        val hiveDB = task.getHiveDB(defaultArea)
        hiveDB.map { hiveDB =>
          val fullTableName = s"$hiveDB.$tableName"
          session.sql(s"create database if not exists $hiveDB")
          session.sql(s"use $hiveDB")
          session.sql(s"drop table if exists $tableName")
          finalDataset.saveAsTable(fullTableName)
          analyze(fullTableName)
        }
      } else {
        finalDataset.save()
        if (coalesce) {
          // The dataset was coalesced to a single partition: rename the single part file written
          // by Spark to <targetDir>/<targetDirName>.<extension> so it is easy to locate.
          val extension = format.getOrElse(settings.comet.defaultWriteFormat)
          val singleFile = storageHandler.list(targetPath, s".$extension", LocalDateTime.MIN).head
          val finalPath = new Path(targetPath, targetPath.getName + s".$extension")
          storageHandler.move(singleFile, finalPath)
        }
      }
    }

    task.postsql.getOrElse(Nil).foreach(session.sql)
    // Let us return the Dataframe so that it can be piped to another sink
    Success(SparkJobResult(Some(dataframe)))
  }
}
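A minimal usage sketch, assuming the imports of the file above and that a Settings instance, an AutoTaskDesc and a StorageHandler are already available from the caller; the identifiers settings, taskDesc and handler, as well as the job name and parameters, are placeholders and not values taken from this file:

// Hypothetical wiring: `settings`, `taskDesc` and `handler` are placeholders assumed to be
// provided by the surrounding application (e.g. a scheduler or CLI entry point).
val job = new AutoTaskJob(
  name = "sales_kpi",                   // job name from the YML job description (made up here)
  defaultArea = None,                   // fall back to the storage area defined in the task
  format = Some("parquet"),
  coalesce = false,
  udf = None,
  views = None,
  engine = Engine.SPARK,
  task = taskDesc,
  storageHandler = handler,
  sqlParameters = Map("year" -> "2021")
)(settings)

job.run() match {
  case Success(_)         => println("dataset written to the task/job target path")
  case Failure(exception) => exception.printStackTrace() // pre/main/post SQL failures end up here
}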



