All downloads are free. Search and download functionality uses the official Maven repository.

com.github.timgent.sparkdataquality.checkssuite.ChecksSuite.scala Maven / Gradle / Ivy

package com.github.timgent.sparkdataquality.checkssuite

import java.time.Instant

import cats.implicits._
import com.github.timgent.sparkdataquality.SdqError.MetricCalculationError
import com.github.timgent.sparkdataquality.checks.ArbDualDsCheck.DatasetPair
import com.github.timgent.sparkdataquality.checks.DatasourceDescription.{DualDsDescription, SingleDsDescription}
import com.github.timgent.sparkdataquality.checks.QCCheck.{DualDsQCCheck, SingleDsCheck}
import com.github.timgent.sparkdataquality.checks._
import com.github.timgent.sparkdataquality.checks.metrics.{DualMetricCheck, SingleMetricCheck}
import com.github.timgent.sparkdataquality.metrics.{MetricDescriptor, MetricValue, MetricsCalculator}
import com.github.timgent.sparkdataquality.repository.{MetricsPersister, NullMetricsPersister, NullQcResultsRepository, QcResultsRepository}
import org.apache.spark.sql.Dataset

import scala.concurrent.{ExecutionContext, Future}

/**
  * Couples a dataset with a textual description of it
  *
  * @param ds - the dataset
  * @param description - description of the dataset
  */
case class DescribedDs(ds: Dataset[_], description: String) {

  /** Wraps this dataset's `description` in a [[SingleDsDescription]]. */
  def datasourceDescription: SingleDsDescription = {
    val singleDsDescription = SingleDsDescription(description)
    singleDsDescription
  }
}

/**
  * A pair of [[DescribedDs]]s
  *
  * @param ds - the first described dataset
  * @param dsToCompare - the second described dataset
  */
case class DescribedDsPair(ds: DescribedDs, dsToCompare: DescribedDs) {

  /** Description of the pair, built from the descriptions of `ds` and `dsToCompare` (in that order). */
  def datasourceDescription: DualDsDescription = DualDsDescription(ds.description, dsToCompare.description)

  /** The two underlying datasets (`ds` first, `dsToCompare` second) without their descriptions. */
  private[sparkdataquality] def rawDatasetPair: DatasetPair = DatasetPair(ds.ds, dsToCompare.ds)
}

/**
  * Main entry point which contains the suite of checks you want to perform
  *
  * @param checkSuiteDescription - description of the check suite
  * @param tags - any tags associated with the check suite
  * @param singleDsChecks - map from a single dataset to a list of checks on that dataset
  * @param dualDsChecks - map from a pair of datasets to a list of checks to do on that pair of datasets
  * @param arbitraryChecks - any other arbitrary checks
  * @param metricsToTrack - metrics to track (even if no checks on them)
  * @param metricsPersister - how to persist metrics
  * @param qcResultsRepository - repository the overall result of a run is saved to
  * @param checkResultCombiner - how the overall result status should be calculated
  */
case class ChecksSuite(
    checkSuiteDescription: String,
    tags: Map[String, String] = Map.empty,
    singleDsChecks: Map[DescribedDs, Seq[SingleDsCheck]] = Map.empty,
    dualDsChecks: Map[DescribedDsPair, Seq[DualDsQCCheck]] = Map.empty,
    arbitraryChecks: Seq[ArbitraryCheck] = Seq.empty,
    metricsToTrack: Map[DescribedDs, Seq[MetricDescriptor]] = Map.empty,
    metricsPersister: MetricsPersister = NullMetricsPersister,
    qcResultsRepository: QcResultsRepository = new NullQcResultsRepository,
    checkResultCombiner: Seq[CheckResult] => CheckSuiteStatus = ChecksSuiteResultStatusCalculator.getWorstCheckStatus
) extends ChecksSuiteBase {

  // The four values below split the configured checks by concrete check type. Every key of the
  // source map is retained, so a dataset (or pair) may map to an empty sequence.

  /** The [[ArbSingleDsCheck]]s among `singleDsChecks`, per dataset. */
  private val arbSingleDsChecks: Map[DescribedDs, Seq[ArbSingleDsCheck]] = singleDsChecks.map {
    case (dds, checks) => (dds, checks.collect { case check: ArbSingleDsCheck => check })
  }

  /** The [[SingleMetricCheck]]s among `singleDsChecks`, per dataset. */
  private val singleMetricChecks: Map[DescribedDs, Seq[SingleMetricCheck[_]]] = singleDsChecks.map {
    case (dds, checks) => (dds, checks.collect { case check: SingleMetricCheck[_] => check })
  }

  /** The [[ArbDualDsCheck]]s among `dualDsChecks`, per dataset pair. */
  private val arbDualDsChecks: Map[DescribedDsPair, Seq[ArbDualDsCheck]] = dualDsChecks.map {
    case (ddsPair, checks) => (ddsPair, checks.collect { case check: ArbDualDsCheck => check })
  }

  /** The [[DualMetricCheck]]s among `dualDsChecks`, per dataset pair. */
  private val dualMetricChecks: Map[DescribedDsPair, Seq[DualMetricCheck[_]]] = dualDsChecks.map {
    case (ddsPair, checks) => (ddsPair, checks.collect { case check: DualMetricCheck[_] => check })
  }

  /**
    * Run all checks in the ChecksSuite
    *
    * The metric based checks are started first. The arbitrary single dataset, dual dataset and
    * free-form checks are then applied on the calling thread, and finally all results are combined
    * and saved to `qcResultsRepository`.
    *
    * @param timestamp - time the checks are being run
    * @param ec        - execution context
    * @return the result of the whole suite; the future completes once the result has been saved
    */
  override def run(timestamp: Instant)(implicit ec: ExecutionContext): Future[ChecksSuiteResult] = {
    val metricBasedCheckResultsFut: Future[Seq[CheckResult]] = runMetricBasedChecks(timestamp)
    val singleDatasetCheckResults: Seq[CheckResult] = for {
      (dds, checks) <- arbSingleDsChecks.toSeq
      check <- checks
    } yield check.applyCheck(dds)
    val datasetComparisonCheckResults: Seq[CheckResult] = for {
      (ddsPair, checks) <- arbDualDsChecks.toSeq
      check <- checks
    } yield check.applyCheck(ddsPair)
    val arbitraryCheckResults = arbitraryChecks.map(_.applyCheck)

    for {
      metricBasedCheckResults <- metricBasedCheckResultsFut
      allCheckResults =
        metricBasedCheckResults ++ singleDatasetCheckResults ++ datasetComparisonCheckResults ++
          arbitraryCheckResults
      checkSuiteResult = ChecksSuiteResult(
        overallStatus = checkResultCombiner(allCheckResults),
        checkSuiteDescription = checkSuiteDescription,
        checkResults = allCheckResults,
        timestamp = timestamp,
        tags
      )
      _ <- qcResultsRepository.save(checkSuiteResult)
    } yield checkSuiteResult
  }

  /**
    * Merges (dataset, descriptors) entries so that each dataset appears once, with the descriptors
    * of all of its entries concatenated in encounter order.
    */
  private def groupDescriptorsByDataset(
      descriptorsPerDataset: Seq[(DescribedDs, List[MetricDescriptor])]
  ): Map[DescribedDs, List[MetricDescriptor]] =
    descriptorsPerDataset.groupBy(_._1).map {
      case (dds, entries) => (dds, entries.flatMap(_._2).toList)
    }

  /**
    * Calculates the minimum required metrics to calculate this check suite
    *
    * @param seqSingleDatasetMetricsChecks - metric checks per single dataset
    * @param seqDualDatasetMetricChecks - metric checks per pair of datasets
    * @param trackMetrics - metrics to calculate regardless of whether any check uses them
    * @return for every dataset referenced by any argument (including both sides of every pair), the
    *         distinct metric descriptors needed for it. A dataset without any metric maps to an
    *         empty list.
    */
  private def getMinimumRequiredMetrics(
      seqSingleDatasetMetricsChecks: Map[DescribedDs, Seq[SingleMetricCheck[_]]],
      seqDualDatasetMetricChecks: Map[DescribedDsPair, Seq[DualMetricCheck[_]]],
      trackMetrics: Map[DescribedDs, Seq[MetricDescriptor]]
  ): Map[DescribedDs, List[MetricDescriptor]] = {
    // Keys are already unique here, so a plain map-to-map transformation is enough.
    val singleDatasetMetricDescriptors: Map[DescribedDs, List[MetricDescriptor]] =
      seqSingleDatasetMetricsChecks.map {
        case (dds, checks) =>
          val descriptors: List[MetricDescriptor] = checks.map(_.metric).toList
          (dds, descriptors)
      }

    // The same dataset can take part in several pairs. Go through a Seq (not a Map) before
    // grouping so that entries for the same dataset are concatenated rather than overwritten.
    val dualDatasetAMetricDescriptors: Map[DescribedDs, List[MetricDescriptor]] =
      groupDescriptorsByDataset(seqDualDatasetMetricChecks.toSeq.map {
        case (ddsPair, checks) =>
          val descriptors: List[MetricDescriptor] = checks.map(_.dsMetric).toList
          (ddsPair.ds, descriptors)
      })

    val dualDatasetBMetricDescriptors: Map[DescribedDs, List[MetricDescriptor]] =
      groupDescriptorsByDataset(seqDualDatasetMetricChecks.toSeq.map {
        case (ddsPair, checks) =>
          val descriptors: List[MetricDescriptor] = checks.map(_.dsToCompareMetric).toList
          (ddsPair.dsToCompare, descriptors)
      })

    val trackedMetricDescriptors: Map[DescribedDs, List[MetricDescriptor]] =
      trackMetrics.map { case (dds, descriptors) => (dds, descriptors.toList) }

    // |+| merges the maps key by key, concatenating the descriptor lists of shared datasets.
    val combinedMetricDescriptors: Map[DescribedDs, List[MetricDescriptor]] =
      singleDatasetMetricDescriptors |+| dualDatasetAMetricDescriptors |+|
        dualDatasetBMetricDescriptors |+| trackedMetricDescriptors

    // Strict map (rather than a lazy mapValues view) so that distinct is computed exactly once.
    combinedMetricDescriptors.map { case (dds, descriptors) => (dds, descriptors.distinct) }
  }

  /**
    * Calculates all required metrics, persists those that were calculated successfully, and then
    * evaluates every single and dual metric check against the calculated metrics.
    *
    * Metrics are calculated on the calling thread; the checks are evaluated once the future
    * returned by `metricsPersister.save` has completed.
    *
    * @param timestamp - time the checks are being run, passed on to the metrics persister
    * @param ec        - execution context
    * @return results of the single dataset metric checks followed by the dual dataset metric checks
    */
  private def runMetricBasedChecks(
      timestamp: Instant
  )(implicit ec: ExecutionContext): Future[Seq[CheckResult]] = {
    val allMetricDescriptors: Map[DescribedDs, List[MetricDescriptor]] =
      getMinimumRequiredMetrics(singleMetricChecks, dualMetricChecks, metricsToTrack)
    val calculatedMetrics: Map[DescribedDs, Either[MetricCalculationError, Map[MetricDescriptor, MetricValue]]] =
      allMetricDescriptors.map {
        case (describedDataset, metricDescriptors) =>
          val metricValues: Either[MetricCalculationError, Map[MetricDescriptor, MetricValue]] =
            MetricsCalculator.calculateMetrics(describedDataset, metricDescriptors)
          (describedDataset, metricValues)
      }

    // Only successfully calculated metrics are persisted; failures surface as check results below.
    val metricsToSave = calculatedMetrics.collect {
      case (describedDataset, Right(metrics)) =>
        (
          describedDataset.datasourceDescription,
          metrics.map {
            case (descriptor, value) => (descriptor.toSimpleMetricDescriptor, value)
          }
        )
    }

    metricsPersister.save(timestamp, metricsToSave).map { _ =>
      singleMetricCheckResults(calculatedMetrics) ++ dualMetricCheckResults(calculatedMetrics)
    }
  }

  /**
    * Evaluates every single dataset metric check against the metrics calculated for its dataset,
    * or produces a metric error result when the calculation for that dataset failed.
    */
  private def singleMetricCheckResults(
      calculatedMetrics: Map[DescribedDs, Either[MetricCalculationError, Map[MetricDescriptor, MetricValue]]]
  ): Seq[CheckResult] =
    singleMetricChecks.toSeq.flatMap {
      case (dds, checks) =>
        val datasourceDescription = dds.datasourceDescription
        // Safe lookup: getMinimumRequiredMetrics yields an entry for every key of singleMetricChecks.
        val maybeMetricsForDs = calculatedMetrics(dds)
        val checkResults: Seq[CheckResult] = checks.map { check =>
          maybeMetricsForDs match {
            case Left(err) => check.getMetricErrorCheckResult(datasourceDescription, err)
            case Right(metricsForDs) =>
              check.applyCheckOnMetrics(metricsForDs).withDatasourceDescription(datasourceDescription)
          }
        }
        checkResults
    }

  /**
    * Evaluates every dual dataset metric check against the metrics calculated for both datasets of
    * its pair, or produces a metric error result carrying the error(s) of whichever side(s) failed.
    */
  private def dualMetricCheckResults(
      calculatedMetrics: Map[DescribedDs, Either[MetricCalculationError, Map[MetricDescriptor, MetricValue]]]
  ): Seq[CheckResult] =
    dualMetricChecks.toSeq.flatMap {
      case (ddsPair, checks) =>
        val datasourceDescription = ddsPair.datasourceDescription
        // Safe lookups: getMinimumRequiredMetrics yields an entry for both datasets of every pair.
        val maybeMetricsForDsA = calculatedMetrics(ddsPair.ds)
        val maybeMetricsForDsB = calculatedMetrics(ddsPair.dsToCompare)
        val checkResults: Seq[CheckResult] = checks.map { check =>
          (maybeMetricsForDsA, maybeMetricsForDsB) match {
            case (Right(metricsForDsA), Right(metricsForDsB)) =>
              check
                .applyCheckOnMetrics(metricsForDsA, metricsForDsB, datasourceDescription)
                .withDatasourceDescription(datasourceDescription)
            case (Left(dsErr), Left(dsToCompareErr)) =>
              check.getMetricErrorCheckResult(datasourceDescription, dsErr, dsToCompareErr)
            case (Right(_), Left(dsToCompareErr)) =>
              check.getMetricErrorCheckResult(datasourceDescription, dsToCompareErr)
            case (Left(dsErr), Right(_)) =>
              check.getMetricErrorCheckResult(datasourceDescription, dsErr)
          }
        }
        checkResults
    }
}




© 2015 - 2024 Weber Informatics LLC | Privacy Policy