/*
 *
 *  * Licensed to the Apache Software Foundation (ASF) under one or more
 *  * contributor license agreements.  See the NOTICE file distributed with
 *  * this work for additional information regarding copyright ownership.
 *  * The ASF licenses this file to You under the Apache License, Version 2.0
 *  * (the "License"); you may not use this file except in compliance with
 *  * the License.  You may obtain a copy of the License at
 *  *
 *  *    http://www.apache.org/licenses/LICENSE-2.0
 *  *
 *  * Unless required by applicable law or agreed to in writing, software
 *  * distributed under the License is distributed on an "AS IS" BASIS,
 *  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 *  * See the License for the specific language governing permissions and
 *  * limitations under the License.
 *
 *
 */

package ai.starlake.config

import ai.starlake.config.Settings.AppConfig
import ai.starlake.config.Settings.JdbcEngine.TableDdl
import ai.starlake.job.load.LoadStrategy
import ai.starlake.job.validator.GenericRowValidator
import ai.starlake.schema.generator.Yml2DagTemplateLoader
import ai.starlake.schema.handlers._
import ai.starlake.schema.model.ConnectionType.JDBC
import ai.starlake.schema.model._
import ai.starlake.sql.SQLUtils
import ai.starlake.transpiler.JSQLTranspiler
import ai.starlake.utils._
import better.files.File
import com.fasterxml.jackson.annotation.{JsonIgnore, JsonIgnoreProperties}
import com.fasterxml.jackson.databind.{JsonNode, ObjectMapper}
import com.typesafe.config.{Config, ConfigFactory, ConfigValueFactory}
import com.typesafe.scalalogging.StrictLogging
import org.apache.commons.lang.SystemUtils
import org.apache.hadoop.fs.Path
import org.apache.kafka.clients.consumer.ConsumerConfig
import org.apache.kafka.clients.producer.ProducerConfig
import org.apache.spark.SparkConf
import org.apache.spark.sql.jdbc.JdbcDialect
import org.apache.spark.storage.StorageLevel
import pureconfig.ConvertHelpers._
import pureconfig._
import pureconfig.generic.auto._
import pureconfig.generic.{FieldCoproductHint, ProductHint}

import java.io.ObjectStreamException
import java.net.URI
import java.sql.DriverManager
import java.util.concurrent.TimeUnit
import java.util.{Locale, Properties, TimeZone, UUID}
import scala.annotation.nowarn
import scala.concurrent.duration.FiniteDuration
import scala.jdk.CollectionConverters._
import scala.util.{Failure, Success, Try}

object Settings extends StrictLogging {
  val latestSchemaVersion: Int = 1
  implicit def hint[A]: ProductHint[A] = ProductHint[A](ConfigFieldMapping(CamelCase, CamelCase))
  private var _referenceConfig: Config = ConfigFactory.load()
  def referenceConfig: Config = _referenceConfig
  private val referenceClassLoader = Thread.currentThread().getContextClassLoader
  def invalidateCaches(): Unit = {
    ConfigFactory.invalidateCaches()
    _referenceConfig = ConfigFactory.load(referenceClassLoader)
  }

  /** Datasets in the data pipeline go through several stages and are stored on disk at each of
    * these stages. This setting allows customizing the folder names of each of these stages.
    *
    * @param stage
    *   : Name of the pending area
    * @param unresolved
    *   : Name of the unresolved area
    * @param archive
    *   : Name of the archive area
    * @param ingesting
    *   : Name of the ingesting area
    */
  @JsonIgnoreProperties(
    Array("acceptedFinal", "rejectedFinal", "businessFinal", "replayFinal")
  )
  final case class Area(
    incoming: String,
    stage: String,
    unresolved: String,
    archive: String,
    ingesting: String,
    replay: String,
    hiveDatabase: String
  ) {
    val replayFinal: String = replay.toLowerCase(Locale.ROOT)
  }

  /** @param options
    *   : Map of privacy algorithm name -> PrivacyEngine
    */
  final case class Privacy(options: Map[String, String])

  final case class Elasticsearch(active: Boolean, options: Map[String, String])

  /** @param discreteMaxCardinality
    *   : Maximum number of unique values allowed when computing cardinality
    */
  final case class Metrics(
    path: String,
    discreteMaxCardinality: Int,
    active: Boolean
  ) // written to the audit sink

  final case class ExpectationsConfig(
    path: String,
    active: Boolean,
    failOnError: Boolean
  ) // written to the expectations sink

  final case class Audit(
    path: String,
    sink: AllSinks,
    maxErrors: Int,
    database: Option[String],
    domain: Option[String],
    active: Option[Boolean],
    sql: Option[String],
    domainExpectation: Option[String],
    domainRejected: Option[String]
  ) {
    def isActive(): Boolean = this.active.getOrElse(false)

    def getConnectionRef()(implicit settings: Settings): String =
      this.sink.connectionRef.getOrElse(settings.appConfig.connectionRef)

    def getConnection()(implicit settings: Settings): Connection =
      settings.appConfig.connections(this.getConnectionRef())

    def getSink()(implicit settings: Settings) =
      this.sink.getSink()

    def getDatabase()(implicit settings: Settings): Option[String] =
      this.database.orElse(settings.appConfig.getDefaultDatabase())

    def getDomain()(implicit settings: Settings): String =
      this.domain.getOrElse("audit")

    def getDomainExpectation()(implicit settings: Settings): String =
      this.domainExpectation.getOrElse("audit")

    def getDomainRejected()(implicit settings: Settings): String =
      this.domainRejected.getOrElse("audit")
  }
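
  /* A minimal sketch of how the fallbacks above resolve, assuming an implicit `Settings`
   * named `settings` is in scope and its audit section leaves most optional fields unset:
   *
   *   val audit = settings.appConfig.audit
   *   audit.isActive()          // false unless `active` is explicitly set
   *   audit.getDomain()         // "audit" unless `domain` is overridden
   *   audit.getConnectionRef()  // falls back to settings.appConfig.connectionRef
   */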

  /** Describes a connection to a JDBC-accessible database engine
    *
    * @param sparkFormat
    *   source / sink format (jdbc by default). See spark.format for possible values
    * @param options
    *   any option required by the format used to ingest / transform / compute the data. For JDBC,
    *   `url`, `user` and `password` are required: `url` is the URI of the database engine and must
    *   start with "jdbc:", `user` is the username used to connect to the database engine and
    *   `password` is the password to use when connecting
    */
  final case class Connection(
    `type`: ConnectionType,
    sparkFormat: Option[String] = None,
    quote: Option[String] = None,
    separator: Option[String] = None,
    options: Map[String, String] = Map.empty,
    _transpileDialect: Option[String] = None
  ) {
    override def toString: String = {
      val redactOptions = Utils.redact(options)
      s"""Connection(
         |    type=${`type`},
         |    sparkFormat=$sparkFormat,
         |    quote=$quote,
         |    separator=$separator,
         |    options=$redactOptions
         |)""".stripMargin
    }

    def sparkDatasource(): Option[String] = {
      this.`type` match {
        case ConnectionType.JDBC =>
          val engineName = options("url").split(':')(1).toLowerCase()
          this.sparkFormat match {
            case Some(_) =>
              engineName match {
                case "snowflake"                                 => Some("snowflake")
                case "redshift" if Utils.isRunningInDatabricks() => Some("redshift")
                case "redshift" => Some("io.github.spark_redshift_community.spark.redshift")
                case _          => Some("jdbc")
              }
            case None => None
          }
        case ConnectionType.BQ => Some("bigquery")
        case _                 => None
      }

    }
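
    /* Sketch of the resolution above for a hypothetical Postgres url (values are illustrative):
     *
     *   // engineName = "postgresql" (second ':'-separated token of the url)
     *   Connection(ConnectionType.JDBC, sparkFormat = Some("jdbc"),
     *     options = Map("url" -> "jdbc:postgresql://localhost:5432/mydb"))
     *     .sparkDatasource()   // Some("jdbc")
     *
     *   Connection(ConnectionType.JDBC, sparkFormat = None,
     *     options = Map("url" -> "jdbc:postgresql://localhost:5432/mydb"))
     *     .sparkDatasource()   // None: no Spark format requested
     */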

    def this() = this(ConnectionType.JDBC, None, None, None, Map.empty)

    def checkValidity()(implicit settings: Settings): List[ValidationMessage] = {
      var errors = List.empty[ValidationMessage]
      val defaultTypes = new Path(DatasetArea.types, "default.sl.yml")
      if (!settings.storageHandler().exists(defaultTypes))
        errors = errors :+ ValidationMessage(
          Severity.Error,
          "Types",
          s"File not found: ${defaultTypes.toString}"
        )

      val globalsCometPath = DatasetArea.env()
      if (!settings.storageHandler().exists(globalsCometPath))
        errors = errors :+ ValidationMessage(
          Severity.Warning,
          "Environment",
          s"env.sl.comet not found in ${globalsCometPath.toString}"
        )

      `type` match {
        case ConnectionType.JDBC =>
          if (!options.contains("url")) {
            errors = errors :+ ValidationMessage(
              Severity.Error,
              "Connection",
              s"Connection type ${`type`} requires a url"
            )
          }
          sparkDatasource() match {
            case Some(datasource) =>
              if (datasource.contains("redshift")) {
                if (options.get("aws_iam_role").isEmpty) {
                  errors = errors :+ ValidationMessage(
                    Severity.Error,
                    "Connection",
                    s"Connection type ${`type`} requires an aws_iam_role"
                  )
                }
                if (options.get("tempdir").isEmpty) {
                  errors = errors :+ ValidationMessage(
                    Severity.Error,
                    "Connection",
                    s"Connection type ${`type`} requires an tempdir"
                  )
                }
              }
              if (datasource.contains("snowflake")) {
                if (options.get("warehouse").isEmpty) {
                  errors = errors :+ ValidationMessage(
                    Severity.Error,
                    "Connection",
                    s"Connection type ${`type`} requires an warehouse"
                  )
                }
                if (options.get("db").isEmpty) {
                  errors = errors :+ ValidationMessage(
                    Severity.Error,
                    "Connection",
                    s"Connection type ${`type`} requires an db"
                  )
                }
              }
            case None =>
          }
        case ConnectionType.BQ =>
          if (!options.contains("location")) {
            errors = errors :+ ValidationMessage(
              Severity.Error,
              "Connection",
              s"Connection type ${`type`} requires a location"
            )
          }
          if (this.sparkFormat.isDefined) {
            val isIndirectWriteMethod = options.getOrElse("writeMethod", "indirect") == "indirect"
            if (isIndirectWriteMethod && !options.contains("gcsBucket")) {
              errors = errors :+ ValidationMessage(
                Severity.Error,
                "Connection",
                s"Connection type ${`type`} requires a gcsBucket"
              )
            }
            if (isIndirectWriteMethod && !options.contains("temporaryGcsBucket")) {
              errors = errors :+ ValidationMessage(
                Severity.Warning,
                "Connection",
                s"Connection type ${`type`}: using gcsBucket as temporaryGcsBucket"
              )
            }
            if (!settings.sparkConfig.hasPath("datasource.bigquery.materializationDataset")) {
              errors = errors :+ ValidationMessage(
                Severity.Error,
                "Connection",
                s"Connection type ${`type`} requires spark.datasource.bigquery.materializationDataset"
              )
            }
          }

          options.getOrElse("authType", "") match {
            case "APPLICATION_DEFAULT" =>
              if (!options.contains("authScopes")) {
                errors = errors :+ ValidationMessage(
                  Severity.Warning,
                  "Connection",
                  s"authScopes not defined in Connection type ${`type`}. Using 'https://www.googleapis.com/auth/cloud-platform'"
                )
              }
            case "SERVICE_ACCOUNT_JSON_KEYFILE" =>
              if (!options.contains("jsonKeyfile")) {
                errors = errors :+ ValidationMessage(
                  Severity.Error,
                  "Connection",
                  s"Connection type ${`type`} requires a jsonKeyfile"
                )
              }
            case "USER_CREDENTIALS" =>
              val clientId = options.get("clientId")
              val clientSecret = options.get("clientSecret")
              val refreshToken = options.get("refreshToken")
              if (clientId.isEmpty || clientSecret.isEmpty || refreshToken.isEmpty) {
                errors = errors :+ ValidationMessage(
                  Severity.Error,
                  "Connection",
                  s"Connection type ${`type`} requires a clientId, clientSecret and refreshToken options"
                )
              }
            case "ACCESS_TOKEN" =>
              val accessToken = options.get("gcpAccessToken")
              if (accessToken.isEmpty) {
                errors = errors :+ ValidationMessage(
                  Severity.Error,
                  "Connection",
                  s"Connection type ${`type`} requires a gcpAccessToken option"
                )
              }
            case _ =>
              errors = errors :+ ValidationMessage(
                Severity.Error,
                "Connection",
                s"Connection type ${`type`} requires an authType"
              )
          }
        case _ =>
      }
      errors
    }

    /** The engine is Spark when sparkFormat is defined; otherwise it is derived from the
      * connection type: BQ for BigQuery, JDBC for JDBC connections and Spark for anything else
      * @return
      *   the engine: Spark, BigQuery or JDBC
      */
    @JsonIgnore
    def getEngine(): Engine = {
      if (sparkFormat.isDefined) Engine.SPARK
      else {
        `type` match {
          case ConnectionType.BQ   => Engine.BQ
          case ConnectionType.JDBC => Engine.JDBC
          case _                   => Engine.SPARK
        }
      }
    }

    @JsonIgnore
    def getDbName() = {
      `type` match {
        case ConnectionType.BQ => "bigquery"
        case ConnectionType.JDBC =>
          val engineName = options("url").split(':')(1).toLowerCase()
          engineName
        case _ => "spark"
      }
    }

    @nowarn
    def datawareOptions(): Map[String, String] =
      options.filterKeys(!Connection.allstorageOptions.contains(_)).toMap

    @nowarn
    def authOptions(): Map[String, String] =
      options.filterKeys(Connection.allstorageOptions.contains(_)).toMap

    @JsonIgnore
    def getJdbcEngineName(): Engine = {
      val engineName = sparkFormat match {
        case None | Some("jdbc") =>
          this.`type` match {
            case JDBC =>
              val engineName = options("url").split(':')(1).toLowerCase()
              if (engineName == "databricks")
                "spark"
              else engineName
            case ConnectionType.BQ => "bigquery"
            case _                 =>
              // if this is a jdbc url (aka snowflake, redshift ...)
              options
                .get("url")
                .map(_.split(':')(1))
                .getOrElse("spark")
          }
        case Some(_) =>
          // if this is a jdbc url (aka snowflake, redshift ...)
          options
            .get("url")
            .map(_.split(':')(1))
            .getOrElse("spark")

      }
      Engine.fromString(engineName)
    }

    @JsonIgnore
    def isBigQuery() = this.`type` == ConnectionType.BQ

    @JsonIgnore
    def isSnowflake(): Boolean = getJdbcEngineName().toString == "snowflake"

    @JsonIgnore
    def isSpark(): Boolean =
      getJdbcEngineName().toString == "spark" || this.`type` == ConnectionType.FS

    @JsonIgnore
    def isJdbcUrl() = this.options.get("url").exists(_.startsWith("jdbc"))

    @JsonIgnore
    def isRedshift(): Boolean = getJdbcEngineName().toString == "redshift"

    @JsonIgnore
    def isPostgreSql(): Boolean = getJdbcEngineName().toString == "postgresql"

    @JsonIgnore
    def isMySQLOrMariaDb(): Boolean = isMySQL() || isMariaDb()

    @JsonIgnore
    def isMySQL(): Boolean = getJdbcEngineName().toString == "mysql"

    @JsonIgnore
    def isMariaDb(): Boolean = getJdbcEngineName().toString == "mariadb"

    @JsonIgnore
    def isDuckDb(): Boolean = getJdbcEngineName().toString == "duckdb"

    @JsonIgnore
    lazy val jdbcUrl: String = applyIfConnectionTypeIs(
      ConnectionType.JDBC,
      options.getOrElse(
        "url",
        throw new RuntimeException(s"Missing url in connection options.")
      )
    )

    @JsonIgnore
    lazy val dialect: JdbcDialect =
      applyIfConnectionTypeIs(ConnectionType.JDBC, SparkUtils.dialect(jdbcUrl))

    def quoteIdentifier(identifier: String): String = dialect.quoteIdentifier(identifier)

    def mergeOptionsWith(additionalConnectionOptions: Map[String, String]): Connection = {
      this.copy(options = options ++ additionalConnectionOptions)
    }

    private def applyIfConnectionTypeIs[T](connectionType: ConnectionType, action: => T): T = {
      `type` match {
        case `connectionType` => action
        case _ =>
          throw new RuntimeException(s"Can only be used for ${`type`} connection type")
      }
    }
  }
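
  /* A short sketch of the helpers above for a hypothetical PostgreSQL connection (url and
   * options are illustrative, not taken from a real configuration):
   *
   *   val pg = Connection(
   *     `type` = ConnectionType.JDBC,
   *     options = Map("url" -> "jdbc:postgresql://localhost:5432/db", "user" -> "sl")
   *   )
   *   pg.getEngine()           // Engine.JDBC (no sparkFormat defined)
   *   pg.getJdbcEngineName()   // Engine.fromString("postgresql")
   *   pg.isJdbcUrl()           // true, the url starts with "jdbc"
   *   pg.jdbcUrl               // "jdbc:postgresql://localhost:5432/db"
   */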

  object Connection {
    val gcsOptions = List(
      "gcsBucket",
      "temporaryGcsBucket",
      "authType",
      "jsonKeyfile",
      "clientId",
      "clientSecret",
      "refreshToken"
    )
    val azureOptions = List(
      "azureStorageContainer",
      "azureStorageAccount",
      "azureStorageKey"
    )
    val s3Options = Nil

    val allstorageOptions = gcsOptions ++ azureOptions ++ s3Options
  }
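
  /* Sketch of the option partitioning done by datawareOptions() / authOptions(): keys listed in
   * Connection.allstorageOptions (GCS / Azure credentials) are treated as auth options, everything
   * else is kept as datasource options.
   *
   *   val conn = Connection(
   *     `type` = ConnectionType.BQ,
   *     options = Map("location" -> "EU", "authType" -> "APPLICATION_DEFAULT")
   *   )
   *   conn.datawareOptions()   // Map("location" -> "EU")
   *   conn.authOptions()       // Map("authType" -> "APPLICATION_DEFAULT")
   */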
  final case class Connections(connections: Map[String, Connection] = Map.empty)

  /** Describes how to use a specific type of JDBC-accessible database engine
    *
    * @param tables
    *   for each of the Standard Table Names used by Comet, the specific SQL DDL statements as
    *   expected in the engine's own dialect.
    */
  final case class JdbcEngine(
    tables: Map[String, TableDdl],
    quote: String,
    viewPrefix: Option[String],
    preActions: Option[String],
    strategyBuilder: String,
    columnRemarks: Option[String] = None,
    tableRemarks: Option[String] = None,
    supportsJson: Option[Boolean] = None
  )

  object JdbcEngine {

    /** A descriptor of the specific SQL DDL statements required to manage a specific Comet table in
      * a JDBC-accessible database engine
      *
      * @param createSql
      *   the SQL Create Table statement with the database-specific type, constraints etc. tacked
      *   on.
      * @param pingSql
      *   a cheap SQL query whose results are irrelevant but guaranteed to trigger an error in case
      *   the table is absent
      * @note
      *   pingSql is optional and defaults to `select count(*) from <tableName> where 1=0`
      */
    final case class TableDdl(
      createSql: String,
      pingSql: Option[String],
      selectSql: Option[String] = None
    ) {

      def effectivePingSql(tableName: String): String =
        pingSql.getOrElse(s"select count(*) from $tableName where 1=0")
    }
  }
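
  /* Sketch of the pingSql fallback above:
   *
   *   val ddl = JdbcEngine.TableDdl(createSql = "CREATE TABLE audit (...)", pingSql = None)
   *   ddl.effectivePingSql("audit")   // "select count(*) from audit where 1=0"
   */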

  final case class Http(
    interface: String,
    port: Int
  )

  final case class Lock(
    path: String,
    timeout: Long,
    pollTime: FiniteDuration = FiniteDuration(5000L, TimeUnit.MILLISECONDS),
    refreshTime: FiniteDuration = FiniteDuration(5000L, TimeUnit.MILLISECONDS)
  )

  final case class Internal(
    cacheStorageLevel: StorageLevel,
    intermediateBigqueryFormat: String,
    temporaryGcsBucket: Option[String],
    substituteVars: Boolean = true,
    bqAuditSaveInBatchMode: Boolean = true
  )

  final case class KafkaTopicConfig(
    topicName: String,
    maxRead: Long = -1,
    fields: List[String] = List("key as STRING", "value as STRING"),
    partitions: Int = 1,
    replicationFactor: Short = 1,
    createOptions: Map[String, String] = Map.empty,
    accessOptions: Map[String, String] = Map.empty,
    headers: Map[String, Map[String, String]] = Map.empty
  ) {
    def allAccessOptions()(implicit settings: Settings): Map[String, String] = {
      settings.appConfig.kafka.sparkServerOptions ++ accessOptions
    }
  }

  @JsonIgnoreProperties(Array("sparkServerOptions"))
  final case class KafkaConfig(
    serverOptions: Map[String, String],
    topics: Map[String, KafkaTopicConfig],
    cometOffsetsMode: Option[String] = Some("STREAM"),
    customDeserializers: Option[Map[String, String]]
  ) {
    lazy val sparkServerOptions: Map[String, String] = {
      val ASSIGN = "assign"
      val SUBSCRIBE_PATTERN = "subscribepattern"
      val SUBSCRIBE = "subscribe"
      val ignoreKafkaProperties = List(
        ConsumerConfig.GROUP_ID_CONFIG,
        ConsumerConfig.AUTO_OFFSET_RESET_CONFIG,
        ConsumerConfig.KEY_DESERIALIZER_CLASS_CONFIG,
        ConsumerConfig.VALUE_DESERIALIZER_CLASS_CONFIG,
        ConsumerConfig.ENABLE_AUTO_COMMIT_CONFIG,
        ConsumerConfig.INTERCEPTOR_CLASSES_CONFIG,
        ProducerConfig.KEY_SERIALIZER_CLASS_CONFIG,
        ProducerConfig.VALUE_SERIALIZER_CLASS_CONFIG,
        ASSIGN,
        SUBSCRIBE_PATTERN,
        SUBSCRIBE
      )
      val kafkaServerProperties = new Properties()
      serverOptions.foreach { case (k, v) =>
        // for spark we need to prefix them with "kafka."
        kafkaServerProperties.put(k, v)
        if (!ignoreKafkaProperties.contains(k) && !k.startsWith("kafka."))
          kafkaServerProperties.put(s"kafka.$k", v)
      }
      kafkaServerProperties.asScala.toMap
    }
  }
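
  /* Sketch of the "kafka." prefixing above: Spark's Kafka source expects broker properties under a
   * "kafka." prefix, while subscription, group and (de)serializer keys are passed through as-is.
   *
   *   val cfg = KafkaConfig(
   *     serverOptions = Map("bootstrap.servers" -> "localhost:9092", "group.id" -> "sl"),
   *     topics = Map.empty,
   *     customDeserializers = None
   *   )
   *   cfg.sparkServerOptions
   *   // Map("bootstrap.servers" -> "localhost:9092",
   *   //     "kafka.bootstrap.servers" -> "localhost:9092",
   *   //     "group.id" -> "sl")   // no "kafka.group.id": group.id is in the ignore list
   */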

  case class SparkScheduling(
    maxJobs: Int,
    poolName: String,
    mode: String,
    file: String
  )

  case class DagRef(load: Option[String], transform: Option[String])

  case class AccessPolicies(apply: Boolean, location: String, database: String, taxonomy: String)

  /** @param datasets
    *   : Absolute path, datasets root folder beneath which each area is defined.
    * @param metadata
    *   : Absolute path, location where all types / domains and auto jobs are defined
    * @param metrics
    *   : Absolute path, location where all computed metrics are stored
    * @param audit
    *   : Absolute path, location where all logs are stored
    * @param archive
    *   : Should we back up the ingested datasets? true by default
    * @param defaultWriteFormat
    *   : Choose between parquet, orc ... Default is parquet
    * @param defaultRejectedWriteFormat
    *   : Writing format for rejected datasets, choose between parquet, orc ... Default is parquet
    * @param defaultAuditWriteFormat
    *   : Writing format for audit datasets, choose between parquet, orc ... Default is parquet
    * @param hive
    *   : Should we create a Hive table? true by default
    * @param area
    *   : see Area above
    */
  @JsonIgnoreProperties(Array("cacheStorageLevel"))
  final case class AppConfig(
    env: String,
    datasets: String,
    dags: String,
    tests: String,
    writeStrategies: String,
    metadata: String,
    metrics: Metrics,
    validateOnLoad: Boolean,
    audit: Audit,
    archive: Boolean,
    sinkReplayToFile: Boolean,
    lock: Lock,
    defaultWriteFormat: String,
    defaultRejectedWriteFormat: String,
    defaultAuditWriteFormat: String,
    csvOutput: Boolean,
    csvOutputExt: String,
    privacyOnly: Boolean,
    emptyIsNull: Boolean,
    loader: String,
    rowValidatorClass: String,
    treeValidatorClass: String,
    loadStrategyClass: String,
    grouped: Boolean,
    groupedMax: Int,
    scd2StartTimestamp: String,
    scd2EndTimestamp: String,
    area: Area,
    hadoop: Map[String, String],
    connections: Map[String, Connection],
    jdbcEngines: Map[String, JdbcEngine],
    privacy: Privacy,
    root: String,
    internal: Option[Internal],
    accessPolicies: AccessPolicies,
    sparkScheduling: SparkScheduling,
    udfs: Option[String],
    expectations: ExpectationsConfig,
    sqlParameterPattern: String,
    rejectAllOnError: Boolean,
    rejectMaxRecords: Int,
    maxParCopy: Int,
    kafka: KafkaConfig,
    dsvOptions: Map[String, String],
    forceViewPattern: String,
    forceDomainPattern: String,
    forceTablePattern: String,
    forceJobPattern: String,
    forceTaskPattern: String,
    useLocalFileSystem: Boolean,
    sessionDurationServe: Long,
    database: String,
    tenant: String,
    connectionRef: String,
    schedulePresets: Map[String, String],
    maxParTask: Int,
    refs: List[Ref],
    dagRef: Option[DagRef],
    forceHalt: Boolean,
    jobIdEnvName: Option[String],
    archiveTablePattern: String,
    archiveTable: Boolean,
    version: String,
    autoExportSchema: Boolean,
    longJobTimeoutMs: Long,
    shortJobTimeoutMs: Long,
    createSchemaIfNotExists: Boolean,
    http: Http,
    timezone: TimeZone,
    hiveInTest: Boolean,
    duckdbMode: Boolean,
    testCsvNullString: String,
    maxInteractiveRecords: Int
    // createTableIfNotExists: Boolean
  ) extends Serializable {

    @JsonIgnore
    def getEffectiveUdfs(): Seq[String] =
      udfs
        .map { udfs =>
          udfs.split(',').toList
        }
        .getOrElse(Nil)
        .filter(_.nonEmpty)

    @JsonIgnore
    lazy val fileSystem: String = {
      val protocolSeparator = "://"
      if (root.matches("^\\w+?:\\/\\/.*")) { // check if it follows URI pattern
        val uri = new URI(root)
        uri.getScheme match {
          case "file" => uri.getScheme + protocolSeparator
          case scheme =>
            // get bucket name
            val bucketName =
              root.substring(scheme.length + protocolSeparator.length).takeWhile(_ != '/')
            s"$scheme$protocolSeparator$bucketName"
        }
      } else {
        s"file$protocolSeparator"
      }
    }
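
    /* Sketch of the scheme / bucket extraction above (illustrative root values):
     *
     *   root = "gs://my-bucket/projects/starlake"   =>  fileSystem == "gs://my-bucket"
     *   root = "file:///home/me/starlake"           =>  fileSystem == "file://"
     *   root = "/home/me/starlake"                  =>  fileSystem == "file://" (no scheme)
     */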

    @JsonIgnore
    def getConnection(connectionRef: String): Connection = {
      connections.getOrElse(
        connectionRef,
        throw new Exception(
          s"Connection $connectionRef not found. Please check your connection definition."
        )
      )
    }
    @JsonIgnore
    def getDefaultConnection(): Connection = {
      connections.getOrElse(
        this.connectionRef,
        throw new Exception(
          s"Connection $connectionRef not found. Please check your connection definition."
        )
      )
    }

    @JsonIgnore
    def getDefaultDatabase(): Option[String] = if (database.isEmpty) None else Some(database)

    val cacheStorageLevel: StorageLevel =
      internal.map(_.cacheStorageLevel).getOrElse(StorageLevel.MEMORY_AND_DISK)

    // config.getOption("hive.metastore.uris")
    @JsonIgnore
    def isHiveCompatible(): Boolean = {
      val connectionTypeIsHive = this.connections
        .get(this.connectionRef)
        .exists { conn =>
          conn.`type` == ConnectionType.FS // && session.conf.getAll.contains("hive.metastore.uris")
        }
      connectionTypeIsHive || Utils.isRunningInDatabricks()
    }

    @JsonIgnore
    def connection(name: String): Option[Connection] = connections.get(name)

    @JsonIgnore
    def connectionOptions(name: String): Map[String, String] =
      connections(name).options

  }
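
  /* Sketch of connection resolution on AppConfig, assuming the configuration defines a connection
   * named "postgres" and connectionRef = "postgres":
   *
   *   settings.appConfig.getDefaultConnection()    // the "postgres" Connection
   *   settings.appConfig.getConnection("postgres") // same lookup by explicit name
   *   settings.appConfig.getConnection("missing")  // throws "Connection missing not found..."
   *   settings.appConfig.getDefaultDatabase()      // None when `database` is empty
   */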

  object AppConfig {
    def checkValidity(
      storageHandler: StorageHandler,
      settings: Settings
    ): List[ValidationMessage] = {
      var errors = List.empty[ValidationMessage]
      val appConfig = settings.appConfig
      if (appConfig.env.nonEmpty && appConfig.env != "None") {
        val envFile = new Path(
          DatasetArea.metadata(settings),
          "env." + appConfig.env + ".sl.yml"
        )
        if (!storageHandler.exists(envFile)) {
          errors = errors :+ ValidationMessage(
            Severity.Error,
            "AppConfig",
            s"${envFile.getName()} not found !!!"
          )
        }
      }
      Try {
        Utils
          .loadInstance[GenericRowValidator](settings.appConfig.rowValidatorClass)
      } match {
        case scala.util.Failure(exception) =>
          errors = errors :+ ValidationMessage(
            Severity.Error,
            "AppConfig",
            s"rowValidatorClass ${settings.appConfig.rowValidatorClass} not found"
          )
        case _ =>
      }

      Try {
        Utils
          .loadInstance[GenericRowValidator](settings.appConfig.treeValidatorClass)
      } match {
        case scala.util.Failure(exception) =>
          errors = errors :+ ValidationMessage(
            Severity.Error,
            "AppConfig",
            s"treeValidatorClass ${settings.appConfig.treeValidatorClass} not found"
          )
        case _ =>
      }
      Try {
        Utils
          .loadInstance[LoadStrategy](settings.appConfig.loadStrategyClass)
      } match {
        case scala.util.Failure(exception) =>
          exception.printStackTrace()
          errors = errors :+ ValidationMessage(
            Severity.Error,
            "AppConfig",
            s"loadStrategyClass ${settings.appConfig.loadStrategyClass} not found"
          )
        case _ =>
      }

      if (!Set("spark", "native").contains(settings.appConfig.loader)) {
        errors = errors :+ ValidationMessage(
          Severity.Error,
          "AppConfig",
          s"loader ${settings.appConfig.loader} not supported"
        )
      }
      settings.appConfig.connections.foreach { case (name, connection) =>
        errors = errors ++ connection.checkValidity()(settings)
      }
      val path = new Path(settings.appConfig.root)
      if (!storageHandler.exists(path)) {
        errors = errors :+ ValidationMessage(
          Severity.Error,
          "AppConfig",
          s"root ${settings.appConfig.root} not found"
        )
      }
      Try {
        settings.appConfig.sqlParameterPattern.r
      } match {
        case scala.util.Failure(exception) =>
          errors = errors :+ ValidationMessage(
            Severity.Error,
            "AppConfig",
            s"sqlParameterPattern ${settings.appConfig.sqlParameterPattern} is not a valid regex"
          )
        case _ =>
      }
      if (settings.appConfig.rejectMaxRecords < 0) {
        errors = errors :+ ValidationMessage(
          Severity.Error,
          "AppConfig",
          s"rejectMaxRecords ${settings.appConfig.rejectMaxRecords} must be positive"
        )
      }
      if (settings.appConfig.maxParCopy <= 0) {
        errors = errors :+ ValidationMessage(
          Severity.Error,
          "AppConfig",
          s"maxParCopy ${settings.appConfig.maxParCopy} must be positive"
        )
      }
      val patterns = List(
        (settings.appConfig.forceViewPattern, "forceViewPattern"),
        (settings.appConfig.forceDomainPattern, "forceDomainPattern"),
        (settings.appConfig.forceTablePattern, "forceTablePattern"),
        (settings.appConfig.forceJobPattern, "forceJobPattern"),
        settings.appConfig.forceTaskPattern -> "forceTaskPattern"
      )
      patterns.foreach { case (value, name) =>
        Try {
          value.r
        } match {
          case Failure(exception) =>
            errors = errors :+ ValidationMessage(
              Severity.Error,
              "AppConfig",
              s"$name value is not a valid regex"
            )
          case _ =>
        }
      }
      if (settings.appConfig.sessionDurationServe <= 0) {
        errors = errors :+ ValidationMessage(
          Severity.Error,
          "AppConfig",
          s"sessionDurationServe ${settings.appConfig.sessionDurationServe} must be positive"
        )
      }

      if (settings.appConfig.connections.isEmpty) {
        errors = errors :+ ValidationMessage(
          Severity.Error,
          "AppConfig",
          s"connections: at least one connection must be defined"
        )
      }

      val validConnectionNames = settings.appConfig.connections.keys.mkString(", ")

      if (settings.appConfig.connectionRef.isEmpty) {
        val msg =
          if (settings.appConfig.connections.isEmpty)
            s"connectionRef must be defined. Define a connection first and set it to this newly defined connection"
          else
            s"connectionRef must be defined. Valid connection names are $validConnectionNames"
        errors = errors :+ ValidationMessage(Severity.Error, "AppConfig", msg)
      } else {
        settings.appConfig.connections.get(settings.appConfig.connectionRef) match {
          case Some(_) =>
          case None =>
            errors = errors :+ ValidationMessage(
              Severity.Error,
              "AppConfig",
              s"Connection ${settings.appConfig.connectionRef} not found. Valid connection names are $validConnectionNames"
            )
        }
      }

      settings.appConfig.schedulePresets.foreach { case (name, cron) =>
        Try {
          cron.r
        } match {
          case Failure(exception) =>
            errors = errors :+ ValidationMessage(
              Severity.Error,
              "AppConfig",
              s"schedulePresets $name value is not a valid cron (${exception.getMessage})"
            )
          case _ =>
        }
      }

      val dagRef: List[String] =
        settings.appConfig.dagRef
          .map(ref => List(ref.load.toList, ref.transform.toList).flatten)
          .getOrElse(Nil)

      val dagTemplateLoader = new Yml2DagTemplateLoader()
      dagRef.foreach { dagRef =>
        val dagConfigRef = if (dagRef.endsWith(".yml")) dagRef else dagRef + ".sl.yml"
        val dagConfigPath = new Path(DatasetArea.dags(settings), dagConfigRef)
        if (!storageHandler.exists(dagConfigPath)) {
          errors = errors :+ ValidationMessage(
            Severity.Error,
            "AppConfig",
            s"dagConfigRef $dagConfigRef not found in ${dagConfigPath.getParent}"
          )
        }
      }
      errors
    }

    private case class JsonWrapped(jsonValue: String) {

      @throws(classOf[ObjectStreamException])
      protected def readResolve: AnyRef = {
        val unwrapped = JsonWrapped.jsonMapper.readValue(jsonValue, classOf[AppConfig])
        unwrapped
      }
    }

    private object JsonWrapped {
      private def jsonMapper: ObjectMapper = new StarlakeObjectMapper()

      def apply(comet: AppConfig): JsonWrapped = {
        val writer = jsonMapper.writerFor(classOf[AppConfig])
        val asJson = writer.writeValueAsString(comet)
        JsonWrapped(asJson)
      }
    }
  }

  implicit val sinkHint: FieldCoproductHint[Sink] = new FieldCoproductHint[Sink]("type") {
    override def fieldValue(name: String) = name
  }

  implicit val connectionTypeReader: ConfigReader[ConnectionType] =
    ConfigReader.fromString[ConnectionType](catchReadError(ConnectionType.fromString))

  implicit val storageLevelReader: ConfigReader[StorageLevel] =
    ConfigReader.fromString[StorageLevel](catchReadError(StorageLevel.fromString))

  implicit val timezoneReader: ConfigReader[TimeZone] =
    ConfigReader.fromString[TimeZone](catchReadError(TimeZone.getTimeZone))

  def loadConf(conf: Option[Config] = None): AppConfig = {
    ConfigSource
      .fromConfig(conf.getOrElse(referenceConfig))
      .loadOrThrow[AppConfig]
  }

  /** @param config
    *   : usually the default configuration loaded from reference.conf except in tests
    * @return
    *   final configuration after merging with application.conf & application.sl.yml
    */

  def apply(
    config: Config,
    env: Option[String],
    root: Option[String]
  ): Settings = {
    val jobId = UUID.randomUUID().toString
    val effectiveConfig =
      config.withValue("job-id", ConfigValueFactory.fromAnyRef(jobId, "per JVM instance"))

    // Load reference.conf
    val loadedConfig = loadConf(Some(effectiveConfig))
    val withRootUpdatedConfig =
      root
        .map { root =>
          val oldRootLength = loadedConfig.root.length
          loadedConfig.copy(
            root = root,
            audit = loadedConfig.audit
              .copy(path = root + loadedConfig.audit.path.substring(oldRootLength)),
            expectations = loadedConfig.expectations
              .copy(path = root + loadedConfig.expectations.path.substring(oldRootLength)),
            datasets = root + loadedConfig.datasets.substring(oldRootLength),
            metadata = root + loadedConfig.metadata.substring(oldRootLength),
            lock =
              loadedConfig.lock.copy(path = root + loadedConfig.lock.path.substring(oldRootLength)),
            metrics = loadedConfig.metrics
              .copy(path = root + loadedConfig.metrics.path.substring(oldRootLength)),
            dags = root + loadedConfig.dags.substring(oldRootLength),
            writeStrategies = root + loadedConfig.writeStrategies.substring(oldRootLength)
          )
        }
        .getOrElse(loadedConfig)

    val withEnvUpdatedEnvConfig =
      env
        .orElse(Option(System.getenv("SL_ENV")))
        .map(env => withRootUpdatedConfig.copy(env = env))
        .getOrElse(withRootUpdatedConfig)

    logger.info(
      "root=" + config.getString("root")
    )
    logger.info(
      "ENV SL_ROOT=" + Option(System.getenv("SL_ROOT")).getOrElse("")
    )

    val rootUpdatedEffectiveConfig =
      effectiveConfig
        .withValue("root", ConfigValueFactory.fromAnyRef(root.getOrElse(System.getenv("SL_ROOT"))))
        .withValue("audit.path", ConfigValueFactory.fromAnyRef(withRootUpdatedConfig.audit.path))
        .withValue(
          "expectations.path",
          ConfigValueFactory.fromAnyRef(withEnvUpdatedEnvConfig.expectations.path)
        )
        .withValue("datasets", ConfigValueFactory.fromAnyRef(withEnvUpdatedEnvConfig.datasets))
        .withValue("metadata", ConfigValueFactory.fromAnyRef(withEnvUpdatedEnvConfig.metadata))
        .withValue("lock.path", ConfigValueFactory.fromAnyRef(withEnvUpdatedEnvConfig.lock.path))
        .withValue(
          "metrics.path",
          ConfigValueFactory.fromAnyRef(withEnvUpdatedEnvConfig.metrics.path)
        )
        .withValue("dags", ConfigValueFactory.fromAnyRef(withEnvUpdatedEnvConfig.dags))
        .withValue(
          "writeStrategies",
          ConfigValueFactory.fromAnyRef(withEnvUpdatedEnvConfig.writeStrategies)
        )

    val withUpdatedEnvConfig =
      env
        .orElse(Option(System.getenv("SL_ENV")))
        .map(env => rootUpdatedEffectiveConfig.withValue("env", ConfigValueFactory.fromAnyRef(env)))
        .getOrElse(rootUpdatedEffectiveConfig)

    logger.debug(YamlSerde.serialize(withEnvUpdatedEnvConfig))
    val settings =
      Settings(
        withEnvUpdatedEnvConfig,
        effectiveConfig.getConfig("spark"),
        effectiveConfig.getConfig("extra")
      )
    // Load application.conf / application.sl.yml
    val loadedSettings =
      loadApplicationYaml(withUpdatedEnvConfig, settings, env, root)
        .orElse(loadApplicationConf(withUpdatedEnvConfig, settings, env))
        .getOrElse(settings)

    val applicationConfSettings =
      if (settings.appConfig.duckdbMode) duckDBMode(loadedSettings)
      else {
        adjustDuckDBProperties(loadedSettings)
      }

    // Reload Storage Handler with the authentication settings
    applicationConfSettings.storageHandler(reload = true)

    // Load fairscheduler.xml
    val jobConf = initSparkConfig(applicationConfSettings)
    val withSparkConfig = applicationConfSettings.copy(jobConf = jobConf)
    val withDefaultSchedules = addDefaultSchedules(withSparkConfig)
    withDefaultSchedules
  }
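
  /* Sketch of the typical entry point above, assuming SL_ROOT points at a valid project root:
   *
   *   implicit val settings: Settings =
   *     Settings(ConfigFactory.load(), env = None, root = sys.env.get("SL_ROOT"))
   */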

  val defaultCronPresets = Map(
    "hourly"  -> "0 * * * *",
    "daily"   -> "0 0 * * *",
    "weekly"  -> "0 0 * * 1",
    "monthly" -> "0 0 1 * *",
    "yearly"  -> "0 0 1 1 *"
  )

  def addDefaultSchedules(settings: Settings): Settings = {
    val schedules = settings.appConfig.schedulePresets ++ defaultCronPresets
    settings.copy(appConfig = settings.appConfig.copy(schedulePresets = schedules))
  }
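
  /* Sketch of the preset merge above: user-defined presets are kept and the built-in cron presets
   * are added on top (for duplicate keys the built-in value, on the right of `++`, wins):
   *
   *   // schedulePresets = Map("nightly" -> "0 2 * * *")
   *   // after addDefaultSchedules: "nightly" -> "0 2 * * *", "daily" -> "0 0 * * *", ...
   */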

  private def loadApplicationConf(
    effectiveConfig: Config,
    settings: Settings,
    env: Option[String] = None
  ): Option[Settings] = {
    val applicationConfPath = new Path(DatasetArea.metadata(settings), "application.conf")
    if (settings.storageHandler().exists(applicationConfPath)) {
      logger.info(s"Loading $applicationConfPath")
      val applicationConfContent = settings.storageHandler().read(applicationConfPath)
      val applicationConfig = ConfigFactory.parseString(applicationConfContent).resolve()
      val effectiveApplicationConfig = applicationConfig
        .withFallback(effectiveConfig)
      logger.debug(effectiveApplicationConfig.toString)
      val mergedSettings = ConfigSource
        .fromConfig(effectiveApplicationConfig)
        .loadOrThrow[AppConfig]

      Some(
        Settings(
          mergedSettings,
          effectiveApplicationConfig.getConfig("spark"),
          effectiveApplicationConfig.getConfig("extra")
        )
      )
    } else {
      None
    }
  }

  /** Load application.sl.yml from the metadata folder
    * @param effectiveConfig
    *   : config to merge with application.sl.yml
    * @param settings
    *   : current settings, used to access the storage and schema handlers
    * @return
    *   the merged Settings when application.sl.yml exists, None otherwise
    */
  private def loadApplicationYaml(
    effectiveConfig: Config,
    settings: Settings,
    env: Option[String],
    root: Option[String]
  ): Option[Settings] = {
    val applicationYmlPath =
      new Path(DatasetArea.metadata(settings), "application.sl.yml")
    val applicationYmlConfig =
      if (settings.storageHandler().exists(applicationYmlPath)) {
        logger.info(s"Loading $applicationYmlPath")
        val schemaHandler = settings.schemaHandler()
        val applicationYmlContent = settings.storageHandler().read(applicationYmlPath)
        val content =
          Try {
            val vars = schemaHandler.activeEnvVars(reload = true, env, root)
            val varsWithRoot = {
              root match {
                case Some(root) =>
                  vars + ("SL_ROOT" -> root)
                case None =>
                  val slRoot = Option(System.getenv("SL_ROOT"))
                    .getOrElse(throw new Exception("SL_ROOT not defined"))
                  vars + ("SL_ROOT" -> slRoot)
              }
            }

            Utils
              .parseJinja(applicationYmlContent, varsWithRoot)(
                settings
              )
          } match {
            case Success(value) => value
            case Failure(exception) =>
              throw new Exception(
                s"Error while parsing Jinja in ${applicationYmlPath.toString}",
                exception
              )
          }
        val finalNode: JsonNode =
          YamlSerde
            .deserializeYamlApplication(content, applicationYmlPath.toString)
            .path("application")
        val jsonString = Utils.newJsonMapper().writeValueAsString(finalNode)
        val applicationConfig = ConfigFactory.parseString(jsonString).resolve()
        Some(applicationConfig)
      } else {
        None
      }

    val applicationSettings = applicationYmlConfig match {
      case Some(applicationConfig) =>
        val effectiveApplicationConfig = applicationConfig
          .withFallback(effectiveConfig)
        logger.debug(effectiveApplicationConfig.toString)

        val mergedSettings = loadConf(Some(effectiveApplicationConfig))

        val applicationSettings = Settings(
          mergedSettings,
          effectiveApplicationConfig.getConfig("spark"),
          effectiveApplicationConfig.getConfig("extra")
        )
        Some(applicationSettings)
      case None =>
        None
    }
    applicationSettings.foreach(_.storageHandler(true)) // Reload with the authentication settings
    applicationSettings
  }

  private def initSparkConfig(settings: Settings): SparkConf = {
    val schedulingConfig = schedulingPath(settings)

    // When using local Spark with remote BigQuery (useful for testing)
    val initialConf =
      settings.appConfig.internal.flatMap(_.temporaryGcsBucket) match {
        case Some(value) => new SparkConf().set("temporaryGcsBucket", value)
        case None        => new SparkConf()
      }

    val thisConf = settings.sparkConfig
      .entrySet()
      .asScala
      .toVector
      .map(x => (x.getKey, x.getValue.unwrapped().toString))
      .foldLeft(initialConf) { case (conf, (key, value)) =>
        logger.debug(s"Setting key: ${key}")
        conf.set("spark." + key, value)
      }
      .set("spark.scheduler.mode", settings.appConfig.sparkScheduling.mode)

    schedulingConfig.foreach(path => thisConf.set("spark.scheduler.allocation.file", path.toString))

    logger.whenDebugEnabled {
      logger.debug(thisConf.toDebugString)
    }
    thisConf
  }

  private def schedulingPath(settings: Settings): Option[Path] = {
    import settings.appConfig.sparkScheduling._
    if (file.isEmpty) {
      val schedulingPath = new Path(DatasetArea.metadata(settings), "fairscheduler.xml")
      Some(schedulingPath).filter(settings.storageHandler().exists)
    } else
      Some(new Path(file))
  }

  def adjustDuckDBProperties(settings: Settings): Settings = {
    val connections = settings.appConfig.connections
    val updatedConnections =
      connections.map { case (name, connection) =>
        val updatedConnection =
          if (connection.isDuckDb())
            connection.copy(sparkFormat = None) // spark mode not supported in duckdb
          else connection
        name -> updatedConnection
      }
    val updatedAppConfig = settings.appConfig.copy(connections = updatedConnections)
    settings.copy(appConfig = updatedAppConfig)
  }
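
  /* Sketch of the adjustment above: a DuckDB connection loses its sparkFormat (Spark mode is not
   * supported for DuckDB), other connections pass through unchanged.
   *
   *   // before: "duck" -> Connection(JDBC, sparkFormat = Some("jdbc"), url = "jdbc:duckdb:/tmp/db")
   *   // after : "duck" -> Connection(JDBC, sparkFormat = None,         url = "jdbc:duckdb:/tmp/db")
   */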

  def duckDBMode(settings: Settings): Settings = {
    val duckdbPath = DatasetArea.path("duckdb.db")(settings)
    val pathAsString = duckdbPath.toUri.getPath
    val duckDBConnection = Connection(
      `type` = ConnectionType.JDBC,
      sparkFormat = None,
      options = Map(
        "url"    -> s"jdbc:duckdb:$pathAsString",
        "driver" -> "org.duckdb.DuckDBDriver"
      )
    )
    val duckdbFile = File(pathAsString)
    if (!duckdbFile.exists) {
      if (!duckdbFile.parent.exists)
        duckdbFile.parent.createDirectories()
      Utils.withResources(DriverManager.getConnection(duckDBConnection.jdbcUrl)) { _ => }
    }
    val updatedConnections = settings.appConfig.connections
      .map { case (k, v) =>
        val duckDBConnectionWithTranspileInfo = SQLUtils.transpilerDialect(v) match {
          case JSQLTranspiler.Dialect.DUCK_DB =>
            duckDBConnection.copy(_transpileDialect = None)
          case dialect =>
            duckDBConnection.copy(_transpileDialect = Some(dialect.name()))
        }

        k -> duckDBConnectionWithTranspileInfo
      }
      .updated("duckdb", duckDBConnection)

    val audit = settings.appConfig.audit.copy(database = None)
    val updatedAppConfig = settings.appConfig.copy(connections = updatedConnections)
    val configWithDuckDB =
      if (updatedAppConfig.connectionRef.isEmpty)
        updatedAppConfig.copy(connectionRef = "duckdb", database = "", audit = audit)
      else
        updatedAppConfig

    settings.copy(appConfig = configWithDuckDB)

  }
}

object CometColumns {
  val cometInputFileNameColumn: String = "comet_input_file_name"
  val slSuccessColumn: String = "sl_success"
  val slErrorMessageColumn: String = "sl_error_message"
}

final case class ApplicationDesc(version: Int, application: Settings.AppConfig)

/** This class holds the current Comet settings and an assembly of reference instances for core,
  * shared services
  *
  * SMELL: this may be the start of a Dependency Injection root (but at 2-3 objects, is DI
  * justified? probably not quite yet) — cchepelov
  */
final case class Settings(
  appConfig: Settings.AppConfig,
  sparkConfig: Config,
  extraConf: Config,
  jobConf: SparkConf = new SparkConf(),
  created: Long = System.currentTimeMillis()
) {

  var _schemaHandler: Option[SchemaHandler] = None
  @transient
  def schemaHandler(
    cliEnv: Map[String, String] = Map.empty,
    reload: Boolean = false
  ): SchemaHandler = {
    _schemaHandler match {
      case Some(handler) if !reload => handler
      case _ =>
        implicit val self: Settings = this
        val handler = new SchemaHandler(this.storageHandler(), cliEnv)
        _schemaHandler = Some(handler)
        handler
    }
  }

  var _storageHandler: Option[StorageHandler] = None
  @transient
  def getWarehouseDir(): Option[String] = if (this.sparkConfig.hasPath("sql.warehouse.dir"))
    Some(this.sparkConfig.getString("sql.warehouse.dir"))
  else None

  @transient
  def storageHandler(reload: Boolean = false): StorageHandler = {
    _storageHandler match {
      case Some(handler) if !reload => handler
      case _ =>
        implicit val self: Settings = this
        val handler =
          if (SystemUtils.IS_OS_WINDOWS || appConfig.useLocalFileSystem)
            new LocalStorageHandler()
          else
            new HdfsStorageHandler(appConfig.fileSystem)
        _storageHandler = Some(handler)
        handler
    }
  }

}
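
/* Sketch of how the cached handlers above are typically obtained, assuming a Settings instance
 * built through Settings.apply:
 *
 *   val storage = settings.storageHandler()               // created once, then reused
 *   val schema  = settings.schemaHandler()                // same caching behaviour
 *   val fresh   = settings.storageHandler(reload = true)  // forces re-creation
 */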

object PrivacyLevels {
  private def make(schemeName: String, encryptionAlgo: String): (TransformEngine, List[String]) = {
    val (privacyObject, typedParams) = TransformEngine.parse(encryptionAlgo)
    val encryption = Utils.loadInstance[TransformEngine](privacyObject)
    (encryption, typedParams)
  }

  private var allPrivacy = Map.empty[String, ((TransformEngine, List[String]), TransformInput)]

  def resetAllPrivacy(): Unit =
    allPrivacy = Map.empty[String, ((TransformEngine, List[String]), TransformInput)]

  @transient
  def allPrivacyLevels(
    options: Map[String, String]
  ): Map[String, ((TransformEngine, List[String]), TransformInput)] = {
    if (allPrivacy.isEmpty) {
      allPrivacy = options.map { case (k, objName) =>
        val encryption = make(k, objName)
        val key = k.toUpperCase(Locale.ROOT)
        (key, (encryption, new TransformInput(key, false)))
      }
    }
    allPrivacy
  }

  def traverse(config: AppConfig): Unit = {
    val jsonNode = YamlSerde.mapper.valueToTree(config).asInstanceOf[JsonNode]
    traverse(jsonNode, jsonNode, "")
  }

  def traverse(refNode: JsonNode, incomingNode: JsonNode, keyPrefix: String): Unit = {
    val itRef = refNode.fields().asScala.toList.sortBy(_.getKey).iterator
    val itIncoming = incomingNode.fields().asScala.toList.sortBy(_.getKey).iterator
    while (itRef.hasNext) {
      val refField = itRef.next()
      val refKey = refField.getKey
      val refValue = refField.getValue
      val incomingField = itIncoming.next()
      val incomingKey = incomingField.getKey
      val incomingValue = incomingField.getValue

      if (refValue.isObject) {
        traverse(refValue, incomingValue, s"$keyPrefix$refKey.")
      } else {
        val refText = refValue.asText()
        val incomingText = incomingValue.asText()
        if (incomingText != refText)
          println(s"$keyPrefix$refKey = $incomingText")
      }
    }
  }
}



