All downloads are FREE. Search and download functionality uses the official Maven repository.

com.github.timgent.sparkdataquality.metrics.MetricDescriptor.scala Maven / Gradle / Ivy

package com.github.timgent.sparkdataquality.metrics

import cats.Show
import com.github.timgent.sparkdataquality.metrics.MetricCalculator.{
  ComplianceMetricCalculator,
  DistinctValuesMetricCalculator,
  SizeMetricCalculator,
  SumValuesMetricCalculator
}
import com.github.timgent.sparkdataquality.metrics.MetricValue.NumericMetricValue

/**
  * Describes the metric being calculated.
  *
  * A descriptor declares which kind of MetricCalculator it uses (the `MC` type member), can provide an
  * instance of that calculator, can be converted to a SimpleMetricDescriptor, and exposes a name for the metric.
  */
private[sparkdataquality] trait MetricDescriptor {
  // The type of MetricCalculator used by this descriptor; fixed by each concrete descriptor
  type MC <: MetricCalculator

  // The MetricType member declared by this descriptor's calculator type MC
  type MetricType = MC#MetricType

  /**
    * The metricCalculator which contains key logic for calculating a MetricValue for this MetricDescriptor
    * @return the MetricCalculator, typed as this descriptor's MC
    */
  def metricCalculator: MC

  /**
    * A representation of the MetricDescriptor that can be more easily handled for persistence
    * @return the SimpleMetricDescriptor
    */
  def toSimpleMetricDescriptor: SimpleMetricDescriptor

  /**
    * A name for the metric
    * @return the name of the metric
    */
  def metricName: String
}

object MetricDescriptor {

  /**
    * Mixed into MetricDescriptors whose dataset can be filtered before the metric is calculated
    */
  trait Filterable {

    /**
      * The filter applied ahead of the metric calculation
      * @return the MetricFilter
      */
    def filter: MetricFilter
  }

  /**
    * Row-count metric: the number of rows in your dataset
    * @param filter - filter to apply before the rows are counted (defaults to MetricFilter.noFilter)
    */
  case class SizeMetric(filter: MetricFilter = MetricFilter.noFilter) extends MetricDescriptor with Filterable {
    override type MC = SizeMetricCalculator
    override def metricName: String = "Size"
    override def metricCalculator: SizeMetricCalculator = SizeMetricCalculator(filter)
    override def toSimpleMetricDescriptor: SimpleMetricDescriptor =
      SimpleMetricDescriptor(
        metricName = metricName,
        filterDescription = Some(filter.filterDescription)
      )
  }

  /**
    * A metric for the sum of the values in a single column, as computed by SumValuesMetricCalculator
    * @param onColumn - the column whose values the metric is calculated on
    * @param filter - filter to apply before the metric is calculated (defaults to MetricFilter.noFilter)
    * @tparam MV - the NumericMetricValue subtype the calculator is parameterised with; a MetricValueConstructor
    *            for it must be implicitly available
    */
  case class SumValuesMetric[MV <: NumericMetricValue: MetricValueConstructor](
      onColumn: String,
      filter: MetricFilter = MetricFilter.noFilter
  ) extends MetricDescriptor with Filterable {
    override type MC = SumValuesMetricCalculator[MV]
    override def metricName: String = "SumValues"
    override def metricCalculator: SumValuesMetricCalculator[MV] =
      SumValuesMetricCalculator[MV](onColumn, filter)
    override def toSimpleMetricDescriptor: SimpleMetricDescriptor =
      SimpleMetricDescriptor(
        metricName = metricName,
        filterDescription = Some(filter.filterDescription),
        onColumn = Some(onColumn)
      )
  }

  /**
    * A metric for the fraction of rows that comply with the given criteria
    * @param complianceFn - the criteria each row is checked against
    * @param filter - filter to apply before the compliance fraction is calculated (defaults to MetricFilter.noFilter)
    */
  case class ComplianceMetric(
      complianceFn: ComplianceFn,
      filter: MetricFilter = MetricFilter.noFilter
  ) extends MetricDescriptor with Filterable {
    override type MC = ComplianceMetricCalculator
    override def metricName: String = "Compliance"
    override def metricCalculator: ComplianceMetricCalculator = ComplianceMetricCalculator(complianceFn, filter)
    override def toSimpleMetricDescriptor: SimpleMetricDescriptor =
      SimpleMetricDescriptor(
        metricName = metricName,
        filterDescription = Some(filter.filterDescription),
        complianceDescription = Some(complianceFn.description)
      )
  }

  /**
    * A metric for the number of distinct values in one column, or distinct combinations across several columns
    * @param onColumns - the columns whose distinct values are counted
    * @param filter - filter to apply before the distinct count is calculated (defaults to MetricFilter.noFilter)
    */
  case class CountDistinctValuesMetric(
      onColumns: List[String],
      filter: MetricFilter = MetricFilter.noFilter
  ) extends MetricDescriptor with Filterable {
    override type MC = DistinctValuesMetricCalculator
    override def metricName: String = "DistinctValues"
    override def metricCalculator: DistinctValuesMetricCalculator = DistinctValuesMetricCalculator(onColumns, filter)
    override def toSimpleMetricDescriptor: SimpleMetricDescriptor =
      SimpleMetricDescriptor(
        metricName = metricName,
        filterDescription = Some(filter.filterDescription),
        onColumns = Some(onColumns)
      )
  }
}

/**
  * Representation of a MetricDescriptor which is easy to persist
  *
  * Only metricName is mandatory; every other field defaults to None and is filled in only by the
  * descriptors for which it is relevant.
  *
  * @param metricName the name of the metric (the descriptor's metricName)
  * @param filterDescription description of the filter applied before the metric is calculated, if any
  * @param complianceDescription description of the compliance criteria, if any (populated by ComplianceMetric)
  * @param onColumns the columns the metric is calculated on, if any (populated by CountDistinctValuesMetric)
  * @param onColumn the single column the metric is calculated on, if any (populated by SumValuesMetric)
  */
private[sparkdataquality] case class SimpleMetricDescriptor(
    metricName: String,
    filterDescription: Option[String] = None,
    complianceDescription: Option[String] = None,
    onColumns: Option[List[String]] = None,
    onColumn: Option[String] = None
)

object SimpleMetricDescriptor {

  /**
    * Show instance rendering a SimpleMetricDescriptor as "metricName=<name>" followed by a ", <field>=<value>"
    * entry for each optional field that is defined. Fields appear in the order filterDescription,
    * complianceDescription, onColumns, onColumn; undefined fields are omitted entirely.
    */
  implicit val showSimpleMetricDescriptor: Show[SimpleMetricDescriptor] = Show.show { smd =>
    // Renders a defined field as ", label=value" and an undefined one as the empty string
    def optionalPart[A](label: String, value: Option[A]): String =
      value.fold("")(present => s", $label=$present")

    val optionalParts = List(
      optionalPart("filterDescription", smd.filterDescription),
      optionalPart("complianceDescription", smd.complianceDescription),
      optionalPart("onColumns", smd.onColumns),
      optionalPart("onColumn", smd.onColumn)
    )

    s"metricName=${smd.metricName}${optionalParts.mkString}"
  }
}




© 2015 - 2024 Weber Informatics LLC | Privacy Policy