
za.co.absa.enceladus.conformance.HyperConformance.scala

/*
 * Copyright 2018 ABSA Group Limited
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package za.co.absa.enceladus.conformance

import java.text.SimpleDateFormat
import java.util.Date
import org.apache.commons.configuration2.Configuration
import org.apache.hadoop.fs.FileSystem
import org.apache.spark.SPARK_VERSION
import org.apache.spark.sql.functions._
import org.apache.spark.sql.types.StructType
import org.apache.spark.sql.{DataFrame, SparkSession}
import org.slf4j.{Logger, LoggerFactory}
import za.co.absa.commons.version.Version.VersionStringInterpolator

import scala.collection.JavaConverters._
import za.co.absa.enceladus.common.Constants._
import za.co.absa.spark.commons.SparkVersionGuard
import za.co.absa.enceladus.conformance.config.ConformanceConfig
import za.co.absa.enceladus.conformance.interpreter.{Always, DynamicInterpreter, FeatureSwitches}
import za.co.absa.enceladus.conformance.streaming.{InfoDateFactory, InfoVersionFactory}
import za.co.absa.enceladus.dao.EnceladusDAO
import za.co.absa.enceladus.dao.OptionallyRetryableException._
import za.co.absa.enceladus.dao.auth.{RestApiCredentialsFactory, RestApiKerberosCredentialsFactory, RestApiPlainCredentialsFactory}
import za.co.absa.enceladus.dao.rest.RestDaoFactory.AvailabilitySetup
import za.co.absa.enceladus.dao.rest.RestDaoFactory
import za.co.absa.enceladus.model.{ConformedSchema, Dataset}
import za.co.absa.enceladus.utils.config.UrisConnectionStringParser
import za.co.absa.enceladus.utils.fs.HadoopFsUtils
import za.co.absa.enceladus.utils.validation.ValidationLevel
import za.co.absa.hyperdrive.ingestor.api.transformer.{StreamTransformer, StreamTransformerFactory}

class HyperConformance (restApiBaseUrls: List[String],
                        urlsRetryCount: Option[Int] = None,
                        restApiAvailabilitySetup: Option[String] = None,
                        optionallyRetryableExceptions: Set[OptRetryableExceptions] = Set.empty)
                       (implicit cmd: ConformanceConfig,
                        featureSwitches: FeatureSwitches,
                        infoDateFactory: InfoDateFactory,
                        infoVersionFactory: InfoVersionFactory) extends StreamTransformer {
  val log: Logger = LoggerFactory.getLogger(this.getClass)

  @throws[IllegalArgumentException]
  def transform(rawDf: DataFrame): DataFrame = {
    implicit val spark: SparkSession = rawDf.sparkSession
    val restApiCredentials = cmd.restApiCredentialsFactory.getInstance()

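    // How the list of REST API base URLs is consumed (e.g. fallback vs. round-robin); defaults to the factory's standard setup when unset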
    val restApiAvailabilitySetupValue = restApiAvailabilitySetup
      .map(AvailabilitySetup.withName).getOrElse(RestDaoFactory.DefaultAvailabilitySetup)
    implicit val dao: EnceladusDAO = RestDaoFactory.getInstance(
      restApiCredentials,
      restApiBaseUrls,
      urlsRetryCount,
      restApiAvailabilitySetupValue,
      optionallyRetryableExceptions)
    dao.authenticate()

    logPreConformanceInfo(rawDf)

    val conformance = dao.getDataset(cmd.datasetName, cmd.datasetVersion, ValidationLevel.ForRun)
    val conformedDf = applyConformanceTransformations(rawDf, conformance)
    log.info(s"Raw schema: ${rawDf.schema.treeString}")
    log.info(s"Publish schema: ${conformedDf.schema.treeString}")
    conformedDf
  }

  def applyConformanceTransformations(rawDf: DataFrame, conformance: Dataset)
                                     (implicit sparkSession: SparkSession, enceladusDAO: EnceladusDAO): DataFrame = {
    sparkSession.conf.set("spark.sql.legacy.timeParserPolicy", "LEGACY") // otherwise timestamp parsing might cause issues

    val schema: StructType = enceladusDAO.getSchema(conformance.schemaName, conformance.schemaVersion)
    val schemaFields = if (schema == null) List() else schema.fields.toList
    val conformedSchema = ConformedSchema(schemaFields, conformance)
    val infoDateColumn = infoDateFactory.getInfoDateColumn(rawDf)
    val infoVersionColumn = infoVersionFactory.getInfoVersionColumn(conformedSchema)

    // using HDFS implementation until HyperConformance is S3-ready
    implicit val hdfs: FileSystem = FileSystem.get(sparkSession.sparkContext.hadoopConfiguration)
    implicit val hdfsUtils: HadoopFsUtils = HadoopFsUtils.getOrCreate(hdfs)
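    // String form of the information date (yyyy-MM-dd), falling back to an empty string when the date is null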
    val dataFormat = coalesce(date_format(infoDateColumn, "yyyy-MM-dd"), lit(""))
    val currentDateColumn = current_date()
    import za.co.absa.enceladus.utils.schema.SparkUtils.DataFrameWithEnhancements
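    // Run dynamic conformance and set the info date/version columns, overwriting them if the interpreter output already contains columns of the same name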
    val conformedDf = DynamicInterpreter().interpret(conformance, rawDf)
      .withColumnOverwriteIfExists(InfoDateColumn, coalesce(infoDateColumn, currentDateColumn))
      .withColumnOverwriteIfExists(InfoDateColumnString, dataFormat)
      .withColumnOverwriteIfExists(InfoVersionColumn, infoVersionColumn)
    conformedDf
  }

  private def logPreConformanceInfo(streamData: DataFrame): Unit = {
    log.info(s"REST API URLs: ${restApiBaseUrls.mkString(",")}, dataset=${cmd.datasetName}, version=${cmd.datasetVersion}")
    log.info(s"Input schema: ${streamData.schema.prettyJson}")
  }
}

/**
 * This is the definition of Dynamic Conformance as a component of Hyperdrive.
 *
 * See https://github.com/AbsaOSS/hyperdrive#configuration for instructions on how the component needs to be configured in Hyperdrive.
 * Example values are given below:
 * {{{
 * enceladus.rest.uri=http://localhost:8080
 * dataset.name=example
 * dataset.version=1
 * report.date=2020-01-29
 * report.version=1
 * event.timestamp.column=EV_TIME
 *
 * # Either plain credentials
 * enceladus.rest.credentials.file=/path/rest_api.credentials
 *
 * # Or a keytab
 * enceladus.rest.auth.keytab=/path/to/keytab
 * }}}
 */
object HyperConformance extends StreamTransformerFactory with HyperConformanceAttributes {
  import HyperConformanceAttributes._

  val log: Logger = LoggerFactory.getLogger(this.getClass)

  private val defaultReportVersion = 1

  @throws[IllegalArgumentException]
  override def apply(conf: Configuration): StreamTransformer = {
    log.info("Building HyperConformance")

    SparkVersionGuard.fromSpark3XCompatibilitySettings.copy(minVersionInclusive = semver"3.2.1")(log)
      .ensureSparkVersionCompatibility(SPARK_VERSION)

    validateConfiguration(conf)

    val restApiCredentialsFactory = getRestApiCredentialsFactory(conf)

    implicit val confConfig: ConformanceConfig = ConformanceConfig(publishPathOverride = None,
      experimentalMappingRule = Some(true),
      isCatalystWorkaroundEnabled = Some(true),
      autocleanStandardizedFolder = Some(false),
      datasetName = conf.getString(datasetNameKey),
      datasetVersion = conf.getInt(datasetVersionKey),
      reportDate = new SimpleDateFormat(ReportDateFormat).format(new Date()),  // Still need a report date for mapping table patterns
      reportVersion = Option(getReportVersion(conf)),
      performanceMetricsFile = None,
      folderPrefix = None,
      persistStorageLevel = None,
      restApiCredentialsFactory = restApiCredentialsFactory
    )

    implicit val featureSwitcher: FeatureSwitches = FeatureSwitches()
      .setExperimentalMappingRuleEnabled(true)
      .setCatalystWorkaroundEnabled(true)
      .setControlFrameworkEnabled(false)
      .setBroadcastStrategyMode(Always)
      .setBroadcastMaxSizeMb(0)
      .setErrColNullability(true)

    implicit val reportDateCol: InfoDateFactory = InfoDateFactory.getFactoryFromConfig(conf)
    implicit val infoVersionCol: InfoVersionFactory = InfoVersionFactory.getFactoryFromConfig(conf)

    val restApiBaseUrls: List[String] = UrisConnectionStringParser.parse(conf.getString(restApiUriKey))
    val restApiUrlsRetryCount: Option[Int] = if (conf.containsKey(restApiUriRetryCountKey)) {
      Option(conf.getInt(restApiUriRetryCountKey))
    } else {
      None
    }
    val restApiAvailabilitySetup: Option[String] = if (conf.containsKey(restApiAvailabilitySetupKey)) {
      Option(conf.getString(restApiAvailabilitySetupKey))
    } else {
      None
    }
    val optionallyRetryableExceptions: Set[OptRetryableExceptions] =
      if (conf.containsKey(restApiOptionallyRetryableExceptions)) {
        conf.getList(classOf[Int], restApiOptionallyRetryableExceptions)
          .asScala
          .toSet
          .map(getOptionallyRetryableException)
      } else {
        Set.empty
      }
    new HyperConformance(
      restApiBaseUrls, restApiUrlsRetryCount, restApiAvailabilitySetup, optionallyRetryableExceptions
    )
  }

  private def getReportVersion(conf: Configuration): Int = {
    if (conf.containsKey(reportVersionKey)) {
      conf.getInt(reportVersionKey)
    } else {
      defaultReportVersion
    }
  }

  @throws[IllegalArgumentException]
  def validateConfiguration(conf: Configuration): Unit = {
    val mandatoryKeys = List(restApiUriKey, datasetNameKey, datasetVersionKey)

    val missingKeys = mandatoryKeys.filterNot(key => conf.containsKey(key))

    if (missingKeys.nonEmpty) {
      throw new IllegalArgumentException(s"Missing mandatory configuration parameters for keys: ${missingKeys.mkString(", ")}.")
    }
  }

  @throws[IllegalArgumentException]
  private def getRestApiCredentialsFactory(conf: Configuration): RestApiCredentialsFactory = {
    val hasCredentialsFile = conf.containsKey(restApiCredentialsFileKey)
    val hasKeytab = conf.containsKey(restApiAuthKeytabKey)

    (hasCredentialsFile, hasKeytab) match {
      case (false, false) => throw new IllegalArgumentException("No authentication method is specified.")
      case (true, false)  => new RestApiPlainCredentialsFactory(conf.getString(restApiCredentialsFileKey))
      case (false, true)  => new RestApiKerberosCredentialsFactory(conf.getString(restApiAuthKeytabKey))
      case (true, true)   => throw new IllegalArgumentException("Either a credentials file or a keytab should be specified, but not both.")
    }
  }
}
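
For illustration, here is a minimal sketch of wiring the transformer up directly, outside Hyperdrive. It is not part of the original file: it assumes a reachable Enceladus REST API, a plain-credentials file, and an existing raw streaming DataFrame rawDf; the configuration keys are taken from the scaladoc example above. In a real deployment, Hyperdrive constructs the transformer from its own ingestion configuration.

import org.apache.commons.configuration2.MapConfiguration
import scala.collection.JavaConverters._

// Hypothetical configuration, mirroring the scaladoc example above
val conf = new MapConfiguration(Map[String, AnyRef](
  "enceladus.rest.uri"              -> "http://localhost:8080",
  "dataset.name"                    -> "example",
  "dataset.version"                 -> "1",
  "report.date"                     -> "2020-01-29",
  "report.version"                  -> "1",
  "enceladus.rest.credentials.file" -> "/path/rest_api.credentials"
).asJava)

val transformer = HyperConformance(conf)       // validates the config and resolves credentials
val conformedDf = transformer.transform(rawDf) // rawDf: raw streaming DataFrame (assumed to exist)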
