All downloads are free. Search and download functionality uses the official Maven repository.

com.amazon.deequ.checks.Check.scala Maven / Gradle / Ivy

Go to download

Deequ is a library built on top of Apache Spark for defining "unit tests for data", which measure data quality in large datasets.

The newest version!
/**
 * Copyright 2024 Amazon.com, Inc. or its affiliates. All Rights Reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License"). You may not
 * use this file except in compliance with the License. A copy of the License
 * is located at
 *
 *     http://aws.amazon.com/apache2.0/
 *
 * or in the "license" file accompanying this file. This file is distributed on
 * an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either
 * express or implied. See the License for the specific language governing
 * permissions and limitations under the License.
 *
 */

package com.amazon.deequ.checks

import com.amazon.deequ.analyzers.runners.AnalyzerContext
import com.amazon.deequ.analyzers.Analyzer
import com.amazon.deequ.analyzers.AnalyzerOptions
import com.amazon.deequ.analyzers.DatasetMatchAnalyzer
import com.amazon.deequ.analyzers.DatasetMatchState
import com.amazon.deequ.analyzers.Histogram
import com.amazon.deequ.analyzers.KLLParameters
import com.amazon.deequ.analyzers.Patterns
import com.amazon.deequ.analyzers.State
import com.amazon.deequ.anomalydetection.HistoryUtils
import com.amazon.deequ.anomalydetection.AnomalyDetectionStrategy
import com.amazon.deequ.anomalydetection.AnomalyDetector
import com.amazon.deequ.anomalydetection.DataPoint
import com.amazon.deequ.checks.ColumnCondition.isAnyNotNull
import com.amazon.deequ.checks.ColumnCondition.isEachNotNull
import com.amazon.deequ.constraints.Constraint._
import com.amazon.deequ.constraints._
import com.amazon.deequ.metrics.BucketDistribution
import com.amazon.deequ.metrics.Distribution
import com.amazon.deequ.metrics.Metric
import com.amazon.deequ.repository.MetricsRepository
import org.apache.spark.sql.DataFrame
import org.apache.spark.sql.expressions.UserDefinedFunction

import scala.util.matching.Regex

/**
  * Levels that can be assigned to a [[Check]].
  * Ids follow declaration order: `Error` is 0 and `Warning` is 1.
  */
object CheckLevel extends Enumeration {
  val Error: Value = Value
  val Warning: Value = Value
}

/**
  * Statuses that a [[CheckResult]] can carry.
  * Ids follow declaration order: `Success` is 0, `Warning` is 1 and `Error` is 2.
  */
object CheckStatus extends Enumeration {
  val Success: Value = Value
  val Warning: Value = Value
  val Error: Value = Value
}


/**
  * Bundles a [[Check]] with a status and the results of individual constraints.
  *
  * @param check             The check this result belongs to
  * @param status            Status value taken from [[CheckStatus]]
  * @param constraintResults One result per evaluated constraint (presumably in the order of the
  *                          check's constraints; confirm against the code that builds this)
  */
case class CheckResult(
  check: Check,
  status: CheckStatus.Value,
  constraintResults: Seq[ConstraintResult]
)


/**
  * A class representing a list of constraints that can be applied to a given
  * [[org.apache.spark.sql.DataFrame]]. In order to run the checks, use the `run` method. You can
  * also use VerificationSuite.run to run your checks along with other Checks and Analysis objects.
  * When run with VerificationSuite, Analyzers required by multiple checks/analysis blocks are
  * optimized to run once.
  *
  * @param level           Assertion level of the check group. If any of the constraints fail this
  *                        level is used for the status of the check.
  * @param description     The name describes the check block. Generally will be used to show in
  *                        the logs.
  * @param constraints     The constraints to apply when this check is run. New ones can be added
  *                        and will return a new object
  */
case class Check(
  level: CheckLevel.Value,
  description: String,
  private[deequ] val constraints: Seq[Constraint] = Seq.empty) {

  /**
   * Returns the name of the columns where each Constraint puts row-level results, if any
   *
   */
  def getRowLevelConstraintColumnNames(): Seq[String] = {
    // Keep only the row-level constraints and project each one to its result column name.
    // `collect` filters and maps in a single pass, so no element has to be wrapped in an
    // Option just to be flattened again, and no outer variable is shadowed.
    constraints.collect { case rowLevel: RowLevelConstraint => rowLevel.getColumnName }
  }

  /**
    * Returns a new Check object with the given constraint added to the constraints list.
    *
    * @param constraint New constraint to be added
    * @return
    */
  def addConstraint(constraint: Constraint): Check = {
    // The current sequence is not modified; the new constraint is appended to a copy of it.
    val extendedConstraints = constraints :+ constraint
    Check(level, description, extendedConstraints)
  }

  /** Adds a constraint that can subsequently be replaced with a filtered version */
  private[this] def addFilterableConstraint(
      creationFunc: Option[String] => Constraint)
    : CheckWithLastConstraintFilterable = {

    // Calling the factory with None yields the constraint without a filter. The factory itself
    // is passed along too, next to the extended constraint list.
    CheckWithLastConstraintFilterable(
      level,
      description,
      constraints :+ creationFunc(None),
      creationFunc)
  }

  /**
    * Creates a constraint that calculates the data frame size and runs the assertion on it.
    *
    * @param assertion Function that receives a long input parameter and returns a boolean
    *                  Assertion functions might refer to the data frame size by "_"
    *                  .hasSize(_>5), meaning the number of rows should be greater than 5
    *                  Or more elaborate function might be provided
    *                  .hasSize{ aNameForSize => aNameForSize > 0 && aNameForSize < 10 }
    * @param hint A hint to provide additional context why a constraint could have failed
    * @return
    */
  def hasSize(assertion: Long => Boolean, hint: Option[String] = None)
    : CheckWithLastConstraintFilterable = {

    // The filter slot stays open: the factory receives it whenever the constraint is built.
    addFilterableConstraint(Constraint.sizeConstraint(assertion, _, hint))
  }

  /**
    * Creates a constraint via `Constraint.columnCountConstraint` and runs the assertion on it.
    *
    * @param assertion Function that receives a long input parameter and returns a boolean
    * @param hint A hint to provide additional context why a constraint could have failed
    * @return
    */
  def hasColumnCount(assertion: Long => Boolean, hint: Option[String] = None)
  : CheckWithLastConstraintFilterable = {
    // The filter handed to the factory is deliberately ignored: it is not forwarded to
    // columnCountConstraint, so the same constraint is produced with or without a filter.
    addFilterableConstraint(_ => Constraint.columnCountConstraint(assertion, hint))
  }

  /**
    * Creates a constraint that asserts on a column completion.
    *
    * @param column Column to run the assertion on
    * @param hint A hint to provide additional context why a constraint could have failed
    * @param analyzerOptions Options to configure analyzer behavior (NullTreatment, FilteredRow)
    * @return
    */
  def isComplete(column: String, hint: Option[String] = None,
                 analyzerOptions: Option[AnalyzerOptions] = None): CheckWithLastConstraintFilterable = {
    // Check.IsOne takes the place of a caller-supplied assertion here.
    addFilterableConstraint { where =>
      completenessConstraint(column, Check.IsOne, where, hint, analyzerOptions)
    }
  }

  /**
    * Creates a constraint that asserts on a column completion.
    * The given assertion is applied to the completeness value computed for this column.
    *
    * @param column    Column to run the assertion on
    * @param assertion Function that receives a double input parameter and returns a boolean
    * @param hint A hint to provide additional context why a constraint could have failed
    * @param analyzerOptions Options to configure analyzer behavior (NullTreatment, FilteredRow)
    * @return
    */
  def hasCompleteness(
      column: String,
      assertion: Double => Boolean,
      hint: Option[String] = None,
      analyzerOptions: Option[AnalyzerOptions] = None)
    : CheckWithLastConstraintFilterable = {
    // The placeholder marks the filter argument, supplied each time the constraint is built.
    addFilterableConstraint(completenessConstraint(column, assertion, _, hint, analyzerOptions))
  }

  /**
    * Creates a constraint that asserts on completion in combined set of columns.
    *
    * @param columns Columns to run the assertion on
    * @param hint A hint to provide additional context why a constraint could have failed
    * @return
    */
  def areComplete(
      columns: Seq[String],
      hint: Option[String] = None)
    : CheckWithLastConstraintFilterable = {
    // Condition produced by isEachNotNull for the given columns; Check.IsOne is the assertion.
    val eachColumnNotNull = isEachNotNull(columns)
    satisfies(eachColumnNotNull, "Combined Completeness", Check.IsOne, hint, columns = columns.toList)
  }

  /**
    * Creates a constraint that asserts on completion in combined set of columns.
    *
    * @param columns Columns to run the assertion on
    * @param assertion Function that receives a double input parameter and returns a boolean
    * @param hint A hint to provide additional context why a constraint could have failed
    * @return
    */
  def haveCompleteness(
      columns: Seq[String],
      assertion: Double => Boolean,
      hint: Option[String] = None)
    : CheckWithLastConstraintFilterable = {
    // Same condition as areComplete, but with the caller-supplied assertion.
    val eachColumnNotNull = isEachNotNull(columns)
    satisfies(eachColumnNotNull, "Combined Completeness", assertion, hint, columns = columns.toList)
  }

  /**
   * Creates a constraint that asserts on completion in combined set of columns.
   *
   * @param columns Columns to run the assertion on
   * @param hint A hint to provide additional context why a constraint could have failed
   * @return
   */
  def areAnyComplete(
      columns: Seq[String],
      hint: Option[String] = None)
  : CheckWithLastConstraintFilterable = {
    // Condition produced by isAnyNotNull for the given columns; Check.IsOne is the assertion.
    val anyColumnNotNull = isAnyNotNull(columns)
    satisfies(anyColumnNotNull, "Any Completeness", Check.IsOne, hint, columns = columns.toList)
  }

  /**
   * Creates a constraint that asserts on completion in combined set of columns.
   *
   * @param columns Columns to run the assertion on
   * @param assertion Function that receives a double input parameter and returns a boolean
   * @param hint A hint to provide additional context why a constraint could have failed
   * @return
   */
  def haveAnyCompleteness(
      columns: Seq[String],
      assertion: Double => Boolean,
      hint: Option[String] = None)
  : CheckWithLastConstraintFilterable = {
    // Same condition as areAnyComplete, but with the caller-supplied assertion.
    val anyColumnNotNull = isAnyNotNull(columns)
    satisfies(anyColumnNotNull, "Any Completeness", assertion, hint, columns = columns.toList)
  }

  /**
    * Creates a constraint that asserts on a column uniqueness.
    *
    * @param column Column to run the assertion on
    * @param hint A hint to provide additional context why a constraint could have failed
    * @param analyzerOptions Options to configure analyzer behavior (NullTreatment, FilteredRow)
    * @return
    */
  def isUnique(column: String, hint: Option[String] = None,
               analyzerOptions: Option[AnalyzerOptions] = None): CheckWithLastConstraintFilterable = {
    // A single column is wrapped into a Seq because uniquenessConstraint takes a column list.
    addFilterableConstraint { where =>
      uniquenessConstraint(Seq(column), Check.IsOne, where, hint, analyzerOptions)
    }
  }

  /**
    * Creates a constraint that asserts on a column(s) primary key characteristics.
    * Currently only checks uniqueness, but reserved for primary key checks if there is another
    * assertion to run on primary key columns.
    *
    * @param column Columns to run the assertion on
    * @return
    */
  def isPrimaryKey(column: String, columns: String*): CheckWithLastConstraintFilterable = {
    // The mandatory first column is prepended to the optional further key columns.
    addFilterableConstraint(uniquenessConstraint(column :: columns.toList, Check.IsOne, _))
  }

  /**
    * Creates a constraint that asserts on a column(s) primary key characteristics.
    * Currently only checks uniqueness, but reserved for primary key checks if there is another
    * assertion to run on primary key columns.
    *
    * @param column Columns to run the assertion on
    * @param hint A hint to provide additional context why a constraint could have failed
    * @return
    */
  def isPrimaryKey(column: String, hint: Option[String],
                   analyzerOptions: Option[AnalyzerOptions], columns: String*)
    : CheckWithLastConstraintFilterable = {
    // Key columns = first column followed by the varargs; hint and options are forwarded as is.
    addFilterableConstraint { where =>
      uniquenessConstraint(column :: columns.toList, Check.IsOne, where, hint, analyzerOptions)
    }
  }

  /**
   * Creates a constraint that asserts on a column(s) primary key characteristics.
   * Currently only checks uniqueness, but reserved for primary key checks if there is another
   * assertion to run on primary key columns.
   *
   * @param column Columns to run the assertion on
   * @param hint   A hint to provide additional context why a constraint could have failed
   * @return
   */
  def isPrimaryKey(column: String, hint: Option[String], columns: String*)
  : CheckWithLastConstraintFilterable = {
    // Variant without analyzer options: only the hint is forwarded.
    addFilterableConstraint(uniquenessConstraint(column :: columns.toList, Check.IsOne, _, hint))
  }

  /**
    * Creates a constraint that asserts on uniqueness in a single or combined set of key columns.
    *
    * @param columns Key columns
    * @param assertion Function that receives a double input parameter and returns a boolean.
    *                  Refers to the fraction of unique values
    * @return
    */
  def hasUniqueness(columns: Seq[String], assertion: Double => Boolean)
    : CheckWithLastConstraintFilterable = {
    // Minimal overload: neither a hint nor analyzer options are passed on.
    addFilterableConstraint { where =>
      uniquenessConstraint(columns, assertion, where)
    }
  }

  /**
   * Creates a constraint that asserts on uniqueness in a single or combined set of key columns.
   *
   * @param columns         Key columns
   * @param assertion       Function that receives a double input parameter and returns a boolean.
   *                        Refers to the fraction of unique values
   * @param hint            A hint to provide additional context why a constraint could have failed
   * @return
   */
  def hasUniqueness(
      columns: Seq[String],
      assertion: Double => Boolean,
      hint: Option[String])
  : CheckWithLastConstraintFilterable = {

    // Overload with a hint; the placeholder marks where the filter is plugged in.
    addFilterableConstraint(uniquenessConstraint(columns, assertion, _, hint))
  }

  /**
    * Creates a constraint that asserts on uniqueness in a single or combined set of key columns.
    *
    * @param columns Key columns
    * @param assertion Function that receives a double input parameter and returns a boolean.
    *                  Refers to the fraction of unique values
    * @param hint A hint to provide additional context why a constraint could have failed
    * @param analyzerOptions Options to configure analyzer behavior (NullTreatment, FilteredRow)
    * @return
    */
  def hasUniqueness(
      columns: Seq[String],
      assertion: Double => Boolean,
      hint: Option[String],
      analyzerOptions: Option[AnalyzerOptions])
    : CheckWithLastConstraintFilterable = {

    // Most general multi-column overload: hint and analyzer options are both forwarded.
    addFilterableConstraint { where =>
      uniquenessConstraint(columns, assertion, where, hint, analyzerOptions)
    }
  }

  /**
    * Creates a constraint that asserts on the uniqueness of a key column.
    *
    * @param column Key column
    * @param assertion Function that receives a double input parameter and returns a boolean.
    *                  Refers to the fraction of unique values.
    * @return
    */
  def hasUniqueness(column: String, assertion: Double => Boolean)
    : CheckWithLastConstraintFilterable = {
    // Delegates to the multi-column overload with a one-element column list.
    val singleColumn = Seq(column)
    hasUniqueness(singleColumn, assertion)
  }

  /**
    * Creates a constraint that asserts on the uniqueness of a key column.
    *
    * @param column Key column
    * @param assertion Function that receives a double input parameter and returns a boolean.
    *                  Refers to the fraction of unique values.
    * @param hint A hint to provide additional context why a constraint could have failed
    * @return
    */
  def hasUniqueness(column: String, assertion: Double => Boolean, hint: Option[String])
    : CheckWithLastConstraintFilterable = {
    // Delegates to the multi-column overload that accepts a hint.
    val singleColumn = Seq(column)
    hasUniqueness(singleColumn, assertion, hint)
  }

  /**
   * Creates a constraint that asserts on the uniqueness of a key column.
   *
   * @param column          Key column
   * @param assertion       Function that receives a double input parameter and returns a boolean.
   *                        Refers to the fraction of unique values.
   * @param hint            A hint to provide additional context why a constraint could have failed
   * @param analyzerOptions Options to configure analyzer behavior (NullTreatment, FilteredRow)
   * @return
   */
  def hasUniqueness(column: String, assertion: Double => Boolean, hint: Option[String],
                    analyzerOptions: Option[AnalyzerOptions])
  : CheckWithLastConstraintFilterable = {
    // Delegates to the multi-column overload that accepts a hint and analyzer options.
    val singleColumn = Seq(column)
    hasUniqueness(singleColumn, assertion, hint, analyzerOptions)
  }

  /**
    * Creates a constraint on the distinctness in a single or combined set of key columns.
    *
    * @param columns columns
    * @param assertion Function that receives a double input parameter and returns a boolean.
    *                  Refers to the fraction of distinct values.
    * @param hint A hint to provide additional context why a constraint could have failed
    * @return
    */
  def hasDistinctness(
      columns: Seq[String], assertion: Double => Boolean,
      hint: Option[String] = None)
    : CheckWithLastConstraintFilterable = {

    // The placeholder is the filter argument, filled in each time the constraint is built.
    addFilterableConstraint(distinctnessConstraint(columns, assertion, _, hint))
  }

  /**
    * Creates a constraint on the unique value ratio in a single or combined set of key columns.
    *
    * @param columns columns
    * @param assertion Function that receives a double input parameter and returns a boolean.
    *                  Refers to the fraction of distinct values.
    * @param hint A hint to provide additional context why a constraint could have failed
    * @param analyzerOptions Options to configure analyzer behavior (NullTreatment, FilteredRow)
    * @return
    */
  def hasUniqueValueRatio(
      columns: Seq[String],
      assertion: Double => Boolean,
      hint: Option[String] = None,
      analyzerOptions: Option[AnalyzerOptions] = None)
    : CheckWithLastConstraintFilterable = {

    // All arguments are forwarded unchanged; only the filter is supplied by the factory.
    addFilterableConstraint { where =>
      uniqueValueRatioConstraint(columns, assertion, where, hint, analyzerOptions)
    }
  }

  /**
   * Performs a dataset check between the base DataFrame supplied to
   * [[com.amazon.deequ.VerificationSuite.onData]] and other DataFrame supplied to this check using Deequ's
   * [[com.amazon.deequ.comparison.DataSynchronization.columnMatch]] framework.
   * This method compares specified columns of both DataFrames and assesses match based on a custom assertion.
   *
   * Utilizes [[com.amazon.deequ.analyzers.DatasetMatchAnalyzer]] for comparing the data
   * and Constraint [[com.amazon.deequ.constraints.DatasetMatchConstraint]].
   *
   * Usage:
   * To use this method, create a VerificationSuite and invoke this method as part of adding checks:
   * {{{
   *   val baseDataFrame: DataFrame = ...
   *   val otherDataFrame: DataFrame = ...
   *   val columnMappings: Map[String, String] = Map("baseCol1" -> "otherCol1", "baseCol2" -> "otherCol2")
   *   val assertionFunction: Double => Boolean = _ > 0.7
   *
   *   val check = new Check(CheckLevel.Error, "Data Synchronization Check")
   *     .doesDatasetMatch(otherDataFrame, columnMappings, assertionFunction)
   *
   *   val verificationResult = VerificationSuite()
   *     .onData(baseDataFrame)
   *     .addCheck(check)
   *     .run()
   * }}}
   *
   * This will add a dataset match check to the VerificationSuite, comparing the specified columns of
   * baseDataFrame and otherDataFrame based on the provided assertion function.
   *
   * @param otherDataset The DataFrame to be compared with the current one. Analyzed in conjunction with the
   *                     current DataFrame to assess data synchronization.
   * @param keyColumnMappings  A map defining the column correlations between the current DataFrame and
   *                           otherDataset. Keys represent column names in the current DataFrame, and
   *                           values are the corresponding column names in otherDataset.
   * @param assertion A function that takes a Double (result of the comparison) and returns a Boolean. Defines the
   *                  condition under which the data in both DataFrames is considered synchronized. For example
   *                  (_ > 0.7) denoting metric value > 0.7 or 70% of records.
   * @param matchColumnMappings A map defining the column correlations between the current DataFrame and
   *                            otherDataset. These are the columns which we will check for equality, post
   *                            joining. Optional; defaults to None, in which case the mappings are derived
   *                            from `keyColumnMappings`.
   * @param hint Optional. Additional context or information about the synchronization check.
   *             Helpful for understanding the intent or specifics of the check. Default is None.
   * @return A [[com.amazon.deequ.checks.Check]] object representing the outcome of the dataset match check.
   *         This object can be used in Deequ's verification suite to assert data quality constraints.
   *
   */
  def doesDatasetMatch(otherDataset: DataFrame,
                       keyColumnMappings: Map[String, String],
                       assertion: Double => Boolean,
                       matchColumnMappings: Option[Map[String, String]] = None,
                       hint: Option[String] = None): Check = {
    // The same assertion is handed to the analyzer and to the constraint that wraps it.
    val analyzer = DatasetMatchAnalyzer(otherDataset, keyColumnMappings, assertion, matchColumnMappings)
    // Not filterable: the constraint is appended through addConstraint, yielding a plain Check.
    addConstraint(
      AnalysisBasedConstraint[DatasetMatchState, Double, Double](analyzer, assertion, hint = hint))
  }

  /**
    * Creates a constraint that asserts on the number of distinct values a column has.
    *
    * @param column     Column to run the assertion on
    * @param assertion  Function that receives a long input parameter and returns a boolean
    * @param binningUdf An optional binning function
    * @param maxBins    Histogram details are only provided for N column values with top counts.
    *                   maxBins sets the N
    * @param hint A hint to provide additional context why a constraint could have failed
    * @return
    */
  def hasNumberOfDistinctValues(
      column: String,
      assertion: Long => Boolean,
      binningUdf: Option[UserDefinedFunction] = None,
      maxBins: Integer = Histogram.MaximumAllowedDetailBins,
      hint: Option[String] = None)
    : CheckWithLastConstraintFilterable = {

    // Frequencies are explicitly requested as absolute values rather than ratios.
    addFilterableConstraint { where =>
      histogramBinConstraint(
        column, assertion, binningUdf, maxBins, where, hint, computeFrequenciesAsRatio = false)
    }
  }

  /**
    * Creates a constraint that asserts on column's value distribution.
    *
    * @param column     Column to run the assertion on
    * @param assertion  Function that receives a Distribution input parameter and returns a boolean.
    *                   E.g
    *                   .hasHistogramValues("att2", _.absolutes("f") == 3)
    *                   .hasHistogramValues("att2",
    *                   _.ratios(Histogram.NullFieldReplacement) == 2/6.0)
    * @param binningUdf An optional binning function
    * @param maxBins    Histogram details are only provided for N column values with top counts.
    *                   maxBins sets the N
    * @param hint A hint to provide additional context why a constraint could have failed
    * @return
    */
  def hasHistogramValues(
      column: String,
      assertion: Distribution => Boolean,
      binningUdf: Option[UserDefinedFunction] = None,
      maxBins: Integer = Histogram.MaximumAllowedDetailBins,
      hint: Option[String] = None)
    : CheckWithLastConstraintFilterable = {

    // The placeholder is the filter argument, filled in each time the constraint is built.
    addFilterableConstraint(histogramConstraint(column, assertion, binningUdf, maxBins, _, hint))
  }

  /**
   * Creates a constraint that asserts on column's sketch size.
   *
   * @param column    Column to run the assertion on
   * @param assertion Function that receives a BucketDistribution input parameter and returns a boolean.
   *                  E.g
   *                  .kllSketchSatisfies("att2", _.parameters(1) >= 16,
   *                  kllParameters = Option(KLLParameters(2, 0.64, 2)))
   * @param kllParameters parameters of KLL Sketch
   * @param hint A hint to provide additional context why a constraint could have failed
   * @return
   */
  def kllSketchSatisfies(
      column: String,
      assertion: BucketDistribution => Boolean,
      kllParameters: Option[KLLParameters] = None,
      hint: Option[String] = None)
    : Check = {

    // Not filterable: the constraint is appended through addConstraint, yielding a plain Check.
    val sketchConstraint = kllConstraint(column, assertion, kllParameters, hint)
    addConstraint(sketchConstraint)
  }

  /**
    * Creates a constraint that runs AnomalyDetection on the new value
    *
    * @param metricsRepository        A metrics repository to get the previous results
    * @param anomalyDetectionStrategy The anomaly detection strategy
    * @param analyzer                 The analyzer for the metric to run anomaly detection on
    * @param withTagValues            Can contain a Map with tag names and the corresponding values
    *                                 to filter for
    * @param beforeDate               The maximum dateTime of previous AnalysisResults to use for
    *                                 the Anomaly Detection
    * @param afterDate                The minimum dateTime of previous AnalysisResults to use for
    *                                 the Anomaly Detection
    * @param hint                     A hint to provide additional context why a constraint
    *                                 could have failed
    * @return
    */
  private[deequ] def isNewestPointNonAnomalous[S <: State[S]](
      metricsRepository: MetricsRepository,
      anomalyDetectionStrategy: AnomalyDetectionStrategy,
      analyzer: Analyzer[S, Metric[Double]],
      withTagValues: Map[String, String],
      afterDate: Option[Long],
      beforeDate: Option[Long],
      hint: Option[String] = None)
    : Check = {

    // Partially applies the companion's isNewestPointNonAnomalous: the first argument list is
    // fixed here and the trailing one is left open, giving the function that is passed to
    // anomalyConstraint as its assertion.
    // NOTE(review): afterDate is passed before beforeDate, matching this method's signature;
    // confirm the companion method declares them in the same order.
    val anomalyAssertionFunction = Check.isNewestPointNonAnomalous(
      metricsRepository,
      anomalyDetectionStrategy,
      analyzer,
      withTagValues,
      afterDate,
      beforeDate
    )(_)

    // Not filterable: the constraint is appended through addConstraint, yielding a plain Check.
    addConstraint(anomalyConstraint(analyzer, anomalyAssertionFunction, hint))
  }


  /**
    * Creates a constraint that asserts on a column entropy.
    *
    * @param column    Column to run the assertion on
    * @param assertion Function that receives a double input parameter and returns a boolean
    * @param hint      A hint to provide additional context why a constraint could have failed
    * @return
    */
  def hasEntropy(
      column: String,
      assertion: Double => Boolean,
      hint: Option[String] = None)
    : CheckWithLastConstraintFilterable = {

    // The placeholder is the filter argument, filled in each time the constraint is built.
    addFilterableConstraint(entropyConstraint(column, assertion, _, hint))
  }

  /**
    * Creates a constraint that asserts on a mutual information between two columns.
    *
    * @param columnA   First column for mutual information calculation
    * @param columnB   Second column for mutual information calculation
    * @param assertion Function that receives a double input parameter and returns a boolean
    * @param hint      A hint to provide additional context why a constraint could have failed
    * @return
    */
  def hasMutualInformation(
      columnA: String,
      columnB: String,
      assertion: Double => Boolean,
      hint: Option[String] = None)
    : CheckWithLastConstraintFilterable = {

    // Both columns are forwarded in the order given by the caller.
    addFilterableConstraint { where =>
      mutualInformationConstraint(columnA, columnB, assertion, where, hint)
    }
  }

  /**
    * Creates a constraint that asserts on an approximated quantile
    *
    * @param column Column to run the assertion on
    * @param quantile Which quantile to assert on
    * @param assertion Function that receives a double input parameter (the computed quantile)
    *                  and returns a boolean
    * @param hint A hint to provide additional context why a constraint could have failed
    * @return
    */
  def hasApproxQuantile(column: String,
      quantile: Double,
      assertion: Double => Boolean,
      hint: Option[String] = None)
    : CheckWithLastConstraintFilterable = {

    // Builds the approximate-quantile constraint; the filter is supplied by the factory.
    addFilterableConstraint { where =>
      approxQuantileConstraint(column, quantile, assertion, where, hint)
    }
  }

  /**
   * Creates a constraint that asserts on an exact quantile
   *
   * @param column    Column to run the assertion on
   * @param quantile  Which quantile to assert on
   * @param assertion Function that receives a double input parameter (the computed quantile)
   *                  and returns a boolean
   * @param hint      A hint to provide additional context why a constraint could have failed
   * @return
   */
  def hasExactQuantile(column: String,
                        quantile: Double,
                        assertion: Double => Boolean,
                        hint: Option[String] = None)
  : CheckWithLastConstraintFilterable = {

    // Builds the exact-quantile constraint; the filter is supplied by the factory.
    addFilterableConstraint { where =>
      exactQuantileConstraint(column, quantile, assertion, where, hint)
    }
  }

  /**
    * Creates a constraint that asserts on the minimum length of the column
    *
    * @param column Column to run the assertion on
    * @param assertion Function that receives a double input parameter and returns a boolean
    * @param hint A hint to provide additional context why a constraint could have failed
    * @param analyzerOptions Options to configure analyzer behavior (NullTreatment, FilteredRow)
    * @return
    */
  def hasMinLength(
      column: String,
      assertion: Double => Boolean,
      hint: Option[String] = None,
      analyzerOptions: Option[AnalyzerOptions] = None)
    : CheckWithLastConstraintFilterable = {

    // The placeholder is the filter argument, filled in each time the constraint is built.
    addFilterableConstraint(minLengthConstraint(column, assertion, _, hint, analyzerOptions))
  }

  /**
    * Creates a constraint that asserts on the maximum length of the column
    *
    * @param column Column to run the assertion on
    * @param assertion Function that receives a double input parameter and returns a boolean
    * @param hint A hint to provide additional context why a constraint could have failed
    * @param analyzerOptions Options to configure analyzer behavior (NullTreatment, FilteredRow)
    * @return
    */
  def hasMaxLength(
      column: String,
      assertion: Double => Boolean,
      hint: Option[String] = None,
      analyzerOptions: Option[AnalyzerOptions] = None)
    : CheckWithLastConstraintFilterable = {

    // The placeholder is the filter argument, filled in each time the constraint is built.
    addFilterableConstraint(maxLengthConstraint(column, assertion, _, hint, analyzerOptions))
  }

  /**
    * Creates a constraint that asserts on the minimum of the column
    *
    * @param column Column to run the assertion on
    * @param assertion Function that receives a double input parameter and returns a boolean
    * @param hint A hint to provide additional context why a constraint could have failed
    * @param analyzerOptions Options to configure analyzer behavior (NullTreatment, FilteredRow)
    * @return
    */
  def hasMin(
      column: String,
      assertion: Double => Boolean,
      hint: Option[String] = None,
      analyzerOptions: Option[AnalyzerOptions] = None)
    : CheckWithLastConstraintFilterable = {

    // Arguments are forwarded unchanged; only the filter comes from the factory.
    addFilterableConstraint { where =>
      minConstraint(column, assertion, where, hint, analyzerOptions)
    }
  }

  /**
    * Creates a constraint that asserts on the maximum of the column
    *
    * @param column Column to run the assertion on
    * @param assertion Function that receives a double input parameter and returns a boolean
    * @param hint A hint to provide additional context why a constraint could have failed
    * @param analyzerOptions Options to configure analyzer behavior (NullTreatment, FilteredRow)
    * @return
    */
  def hasMax(
      column: String,
      assertion: Double => Boolean,
      hint: Option[String] = None,
      analyzerOptions: Option[AnalyzerOptions] = None)
    : CheckWithLastConstraintFilterable = {

    // Arguments are forwarded unchanged; only the filter comes from the factory.
    addFilterableConstraint { where =>
      maxConstraint(column, assertion, where, hint, analyzerOptions)
    }
  }

  /**
    * Creates a constraint that asserts on the mean of the column
    *
    * @param column Column to run the assertion on
    * @param assertion Function that receives a double input parameter and returns a boolean
    * @param hint A hint to provide additional context why a constraint could have failed
    * @return
    */
  def hasMean(
      column: String,
      assertion: Double => Boolean,
      hint: Option[String] = None)
    : CheckWithLastConstraintFilterable = {

    // The placeholder is the filter argument, filled in each time the constraint is built.
    addFilterableConstraint(meanConstraint(column, assertion, _, hint))
  }

  /**
    * Creates a constraint that asserts on the sum of the column
    *
    * @param column Column to run the assertion on
    * @param assertion Function that receives a double input parameter and returns a boolean
    * @param hint A hint to provide additional context why a constraint could have failed
    * @return
    */
  def hasSum(
      column: String,
      assertion: Double => Boolean,
      hint: Option[String] = None)
    : CheckWithLastConstraintFilterable = {

    // The sum constraint is built from the filter that addFilterableConstraint hands in
    addFilterableConstraint(rowFilter =>
      sumConstraint(column, assertion, rowFilter, hint))
  }

  /**
    * Creates a constraint that asserts on the standard deviation of the column
    *
    * @param column Column to run the assertion on
    * @param assertion Function that receives a double input parameter and returns a boolean
    * @param hint A hint to provide additional context why a constraint could have failed
    * @return
    */
  def hasStandardDeviation(
      column: String,
      assertion: Double => Boolean,
      hint: Option[String] = None)
    : CheckWithLastConstraintFilterable = {

    // The standard deviation constraint is built from the filter handed in by
    // addFilterableConstraint
    addFilterableConstraint(rowFilter =>
      standardDeviationConstraint(column, assertion, rowFilter, hint))
  }

  /**
    * Creates a constraint that asserts on the approximate count distinct of the given column
    *
    * @param column Column to run the assertion on
    * @param assertion Function that receives a double input parameter and returns a boolean
    * @param hint A hint to provide additional context why a constraint could have failed
    * @return
    */
  def hasApproxCountDistinct(
      column: String,
      assertion: Double => Boolean,
      hint: Option[String] = None)
    : CheckWithLastConstraintFilterable = {

    // The approximate count distinct constraint is built from the filter handed in by
    // addFilterableConstraint
    addFilterableConstraint(rowFilter =>
      approxCountDistinctConstraint(column, assertion, rowFilter, hint))
  }

  /**
    * Creates a constraint that asserts on the pearson correlation between two columns.
    *
    * @param columnA   First column for correlation calculation
    * @param columnB   Second column for correlation calculation
    * @param assertion Function that receives a double input parameter and returns a boolean
    * @param hint A hint to provide additional context why a constraint could have failed
    * @return
    */
  def hasCorrelation(
      columnA: String,
      columnB: String,
      assertion: Double => Boolean,
      hint: Option[String] = None)
    : CheckWithLastConstraintFilterable = {

    // The correlation constraint over both columns is built from the filter handed in by
    // addFilterableConstraint
    addFilterableConstraint(rowFilter =>
      correlationConstraint(columnA, columnB, assertion, rowFilter, hint))
  }

  /**
    * Creates a constraint that runs the given condition on the data frame.
    *
    * @param columnCondition Data frame column which is a combination of expression and the column
    *                        name. It has to comply with Spark SQL syntax.
    *                        Can be written in an exact same way with conditions inside the
    *                        `WHERE` clause.
    * @param constraintName  A name that summarizes the check being made. This name is being used to
    *                        name the metrics for the analysis being done.
    * @param assertion       Function that receives a double input parameter and returns a boolean
    * @param hint A hint to provide additional context why a constraint could have failed
    * @param columns         Columns that the condition refers to (forwarded to the compliance
    *                        constraint)
    * @param analyzerOptions Options to configure analyzer behavior (NullTreatment, FilteredRow)
    * @return
    */
  def satisfies(
      columnCondition: String,
      constraintName: String,
      assertion: Double => Boolean = Check.IsOne,
      hint: Option[String] = None,
      columns: List[String] = List.empty[String],
      analyzerOptions: Option[AnalyzerOptions] = None)
    : CheckWithLastConstraintFilterable = {

    // Note that complianceConstraint expects the constraint name before the condition
    addFilterableConstraint(rowFilter =>
      complianceConstraint(
        constraintName, columnCondition, assertion, rowFilter, hint, columns, analyzerOptions))
  }

  /**
    * Checks for pattern compliance. Given a column name and a regular expression, defines a
    * Check on the average compliance of the column's values to the regular expression.
    *
    * @param column Name of the column that should be checked.
    * @param pattern The columns values will be checked for a match against this pattern.
    * @param assertion Function that receives a double input parameter and returns a boolean
    * @param name Optional name that is forwarded to the pattern match constraint
    * @param hint A hint to provide additional context why a constraint could have failed
    * @param analyzerOptions Options to configure analyzer behavior (NullTreatment, FilteredRow)
    * @return
    */
  def hasPattern(
      column: String,
      pattern: Regex,
      assertion: Double => Boolean = Check.IsOne,
      name: Option[String] = None,
      hint: Option[String] = None,
      analyzerOptions: Option[AnalyzerOptions] = None)
    : CheckWithLastConstraintFilterable = {

    // The pattern match constraint is built from the filter handed in by addFilterableConstraint
    addFilterableConstraint(rowFilter =>
      Constraint.patternMatchConstraint(
        column, pattern, assertion, rowFilter, name, hint, analyzerOptions))
  }

  /**
    * Check to run against the compliance of a column against a Credit Card pattern.
    *
    * @param column Name of the column that should be checked.
    * @param assertion Function that receives a double input parameter and returns a boolean
    * @param hint A hint to provide additional context why a constraint could have failed
    * @return
    */
  def containsCreditCardNumber(
      column: String,
      assertion: Double => Boolean = Check.IsOne,
      hint: Option[String] = None)
    : CheckWithLastConstraintFilterable = {

    // Delegates to hasPattern with the predefined credit card pattern and a fixed name
    val constraintName = s"containsCreditCardNumber($column)"

    hasPattern(column, Patterns.CREDITCARD, assertion, name = Some(constraintName), hint = hint)
  }

  /**
    * Check to run against the compliance of a column against an e-mail pattern.
    *
    * @param column Name of the column that should be checked.
    * @param assertion Function that receives a double input parameter and returns a boolean
    * @param hint A hint to provide additional context why a constraint could have failed
    * @return
    */
  def containsEmail(
      column: String,
      assertion: Double => Boolean = Check.IsOne,
      hint: Option[String] = None)
    : CheckWithLastConstraintFilterable = {

    // Delegates to hasPattern with the predefined e-mail pattern and a fixed name
    val constraintName = s"containsEmail($column)"

    hasPattern(column, Patterns.EMAIL, assertion, name = Some(constraintName), hint = hint)
  }

  /**
    * Check to run against the compliance of a column against an URL pattern.
    *
    * @param column Name of the column that should be checked.
    * @param assertion Function that receives a double input parameter and returns a boolean
    * @param hint A hint to provide additional context why a constraint could have failed
    * @return
    */
  def containsURL(
      column: String,
      assertion: Double => Boolean = Check.IsOne,
      hint: Option[String] = None)
    : CheckWithLastConstraintFilterable = {

    // Delegates to hasPattern with the predefined URL pattern and a fixed name
    val constraintName = s"containsURL($column)"

    hasPattern(column, Patterns.URL, assertion, name = Some(constraintName), hint = hint)
  }

  /**
    * Check to run against the compliance of a column against the Social security number pattern
    * for the US.
    *
    * @param column Name of the column that should be checked.
    * @param assertion Function that receives a double input parameter and returns a boolean
    * @param hint A hint to provide additional context why a constraint could have failed
    * @return
    */
  def containsSocialSecurityNumber(
      column: String,
      assertion: Double => Boolean = Check.IsOne,
      hint: Option[String] = None)
    : CheckWithLastConstraintFilterable = {

    // Delegates to hasPattern with the predefined US social security number pattern
    val constraintName = s"containsSocialSecurityNumber($column)"

    hasPattern(
      column,
      Patterns.SOCIAL_SECURITY_NUMBER_US,
      assertion,
      name = Some(constraintName),
      hint = hint)
  }

  /**
    * Check to run against the fraction of rows that conform to the given data type.
    *
    * @param column Name of the columns that should be checked.
    * @param dataType Data type that the columns should be compared against.
    * @param assertion Function that receives a double input parameter and returns a boolean
    * @param hint A hint to provide additional context why a constraint could have failed
    * @return
    */
  def hasDataType(
      column: String,
      dataType: ConstrainableDataTypes.Value,
      assertion: Double => Boolean = Check.IsOne,
      hint: Option[String] = None)
    : CheckWithLastConstraintFilterable = {

    // The data type constraint is built from the filter handed in by addFilterableConstraint
    addFilterableConstraint(rowFilter =>
      Constraint.dataTypeConstraint(column, dataType, assertion, rowFilter, hint))
  }

  /**
    * Creates a constraint that asserts that a column contains no negative values
    *
    * @param column Column to run the assertion on
    * @param assertion Function that receives a double input parameter and returns a boolean
    * @param hint A hint to provide additional context why a constraint could have failed
    * @return
    */
  def isNonNegative(
      column: String,
      assertion: Double => Boolean = Check.IsOne,
      hint: Option[String] = None)
    : CheckWithLastConstraintFilterable = {

    // NULL values are coalesced to 0.0 so that they are not counted as non-compliant.
    // NOTE: cast to DECIMAL(20, 10) is needed to handle scientific notations
    val nonNegativeCondition = s"COALESCE(CAST($column AS DECIMAL(20,10)), 0.0) >= 0"

    satisfies(
      columnCondition = nonNegativeCondition,
      constraintName = s"$column is non-negative",
      assertion = assertion,
      hint = hint,
      columns = List(column))
  }

  /**
    * Creates a constraint that asserts that a column contains only positive values
    * (NULL values are not counted as violations)
    *
    * @param column Column to run the assertion on
    * @param assertion Function that receives a double input parameter and returns a boolean
    * @param hint A hint to provide additional context why a constraint could have failed
    * @return
    */
  def isPositive(
      column: String,
      assertion: Double => Boolean = Check.IsOne,
      hint: Option[String] = None)
    : CheckWithLastConstraintFilterable = {

    // NULL values are coalesced to 1.0 so that they are not counted as non-compliant.
    // NOTE: cast to DECIMAL(20, 10) is needed to handle scientific notations
    val positiveCondition = s"COALESCE(CAST($column AS DECIMAL(20,10)), 1.0) > 0"

    satisfies(
      columnCondition = positiveCondition,
      constraintName = s"$column is positive",
      assertion = assertion,
      hint = hint,
      columns = List(column))
  }

  /**
    *
    * Asserts that, in each row, the value of columnA is less than the value of columnB
    *
    * @param columnA Column to run the assertion on
    * @param columnB Column to run the assertion on
    * @param assertion Function that receives a double input parameter and returns a boolean
    * @param hint A hint to provide additional context why a constraint could have failed
    * @return
    */
  def isLessThan(
      columnA: String,
      columnB: String,
      assertion: Double => Boolean = Check.IsOne,
      hint: Option[String] = None)
    : CheckWithLastConstraintFilterable = {

    // Row-wise comparison of both columns, expressed as a condition for satisfies
    val condition = s"$columnA < $columnB"

    satisfies(
      columnCondition = condition,
      constraintName = s"$columnA is less than $columnB",
      assertion = assertion,
      hint = hint,
      columns = List(columnA, columnB))
  }

  /**
    * Asserts that, in each row, the value of columnA is less than or equal to the value of columnB
    *
    * @param columnA Column to run the assertion on
    * @param columnB Column to run the assertion on
    * @param assertion Function that receives a double input parameter and returns a boolean
    * @param hint A hint to provide additional context why a constraint could have failed
    * @return
    */
  def isLessThanOrEqualTo(
      columnA: String,
      columnB: String,
      assertion: Double => Boolean = Check.IsOne,
      hint: Option[String] = None)
    : CheckWithLastConstraintFilterable = {

    // Row-wise comparison of both columns, expressed as a condition for satisfies
    val condition = s"$columnA <= $columnB"

    satisfies(
      columnCondition = condition,
      constraintName = s"$columnA is less than or equal to $columnB",
      assertion = assertion,
      hint = hint,
      columns = List(columnA, columnB))
  }

  /**
    * Asserts that, in each row, the value of columnA is greater than the value of columnB
    *
    * @param columnA Column to run the assertion on
    * @param columnB Column to run the assertion on
    * @param assertion Function that receives a double input parameter and returns a boolean
    * @param hint A hint to provide additional context why a constraint could have failed
    * @return
    */
  def isGreaterThan(
      columnA: String,
      columnB: String,
      assertion: Double => Boolean = Check.IsOne,
      hint: Option[String] = None)
    : CheckWithLastConstraintFilterable = {

    // Row-wise comparison of both columns, expressed as a condition for satisfies
    val condition = s"$columnA > $columnB"

    satisfies(
      columnCondition = condition,
      constraintName = s"$columnA is greater than $columnB",
      assertion = assertion,
      hint = hint,
      columns = List(columnA, columnB))
  }

  /**
    * Asserts that, in each row, the value of columnA is greater than or equal to the value of
    * columnB
    *
    * @param columnA Column to run the assertion on
    * @param columnB Column to run the assertion on
    * @param assertion Function that receives a double input parameter and returns a boolean
    * @param hint A hint to provide additional context why a constraint could have failed
    * @return
    */
  def isGreaterThanOrEqualTo(
      columnA: String,
      columnB: String,
      assertion: Double => Boolean = Check.IsOne,
      hint: Option[String] = None)
    : CheckWithLastConstraintFilterable = {

    // Row-wise comparison of both columns, expressed as a condition for satisfies
    val condition = s"$columnA >= $columnB"

    satisfies(
      columnCondition = condition,
      constraintName = s"$columnA is greater than or equal to $columnB",
      assertion = assertion,
      hint = hint,
      columns = List(columnA, columnB))
  }

  // We can't use default values here as you can't combine default values and overloading in Scala
  /**
    * Asserts that every non-null value in a column is contained in a set of predefined values
    *
    * @param column Column to run the assertion on
    * @param allowedValues allowed values for the column
    * @return
    */
  def isContainedIn(
      column: String,
      allowedValues: Array[String])
    : CheckWithLastConstraintFilterable = {

    // Forward to the most general overload: Check.IsOne as assertion, no hint, no options
    val noHint: Option[String] = None
    val noAnalyzerOptions: Option[AnalyzerOptions] = None

    isContainedIn(column, allowedValues, Check.IsOne, noHint, noAnalyzerOptions)
  }

  // We can't use default values here as you can't combine default values and overloading in Scala
  /**
    * Asserts that every non-null value in a column is contained in a set of predefined values
    *
    * @param column Column to run the assertion on
    * @param allowedValues allowed values for the column
    * @param hint A hint to provide additional context why a constraint could have failed
    * @return
    */
  def isContainedIn(
      column: String,
      allowedValues: Array[String],
      hint: Option[String])
    : CheckWithLastConstraintFilterable = {

    // Forward to the most general overload: Check.IsOne as assertion, no analyzer options
    val noAnalyzerOptions: Option[AnalyzerOptions] = None

    isContainedIn(column, allowedValues, Check.IsOne, hint, noAnalyzerOptions)
  }

  // We can't use default values here as you can't combine default values and overloading in Scala
  /**
    * Asserts that every non-null value in a column is contained in a set of predefined values
    *
    * @param column Column to run the assertion on
    * @param allowedValues Allowed values for the column
    * @param assertion Function that receives a double input parameter and returns a boolean
    * @return
    */
  def isContainedIn(
      column: String,
      allowedValues: Array[String],
      assertion: Double => Boolean)
    : CheckWithLastConstraintFilterable = {

    // Forward to the most general overload: no hint, no analyzer options
    val noHint: Option[String] = None
    val noAnalyzerOptions: Option[AnalyzerOptions] = None

    isContainedIn(column, allowedValues, assertion, noHint, noAnalyzerOptions)
  }

  // We can't use default values here as you can't combine default values and overloading in Scala
  /**
   * Asserts that every non-null value in a column is contained in a set of predefined values
   *
   * @param column        Column to run the assertion on
   * @param allowedValues Allowed values for the column
   * @param assertion     Function that receives a double input parameter and returns a boolean
   * @param hint A hint to provide additional context why a constraint could have failed
   * @return
   */
  def isContainedIn(
      column: String,
      allowedValues: Array[String],
      assertion: Double => Boolean,
      hint: Option[String])
    : CheckWithLastConstraintFilterable = {

    // Forward to the most general overload: no analyzer options
    val noAnalyzerOptions: Option[AnalyzerOptions] = None

    isContainedIn(column, allowedValues, assertion, hint, noAnalyzerOptions)
  }

  // We can't use default values here as you can't combine default values and overloading in Scala
  /**
    * Asserts that every non-null value in a column is contained in a set of predefined values
    *
    * @param column Column to run the assertion on
    * @param allowedValues Allowed values for the column
    * @param assertion Function that receives a double input parameter and returns a boolean
    * @param hint A hint to provide additional context why a constraint could have failed
    * @param analyzerOptions Options to configure analyzer behavior (NullTreatment, FilteredRow)
    * @return
    */
  def isContainedIn(
      column: String,
      allowedValues: Array[String],
      assertion: Double => Boolean,
      hint: Option[String],
      analyzerOptions: Option[AnalyzerOptions])
    : CheckWithLastConstraintFilterable = {

    // NULL values always comply; only non-null values are compared against the allowed set
    val nullIsCompliant = s"`$column` IS NULL"

    val predicate =
      if (allowedValues.isEmpty) {
        // An empty set would otherwise be rendered as IN (''), which wrongly accepts empty
        // strings although no value is allowed at all
        nullIsCompliant
      } else {
        // Single quotes inside the values are backslash-escaped before each value is quoted
        val valueList = allowedValues
          .map { _.replaceAll("'", "\\\\\'") }
          .mkString("'", "','", "'")

        s"$nullIsCompliant OR `$column` IN ($valueList)"
      }

    satisfies(predicate, s"$column contained in ${allowedValues.mkString(",")}",
      assertion, hint, List(column), analyzerOptions)
  }

  /**
    * Asserts that the non-null values in a numeric column fall into the predefined interval
    *
    * @param column column to run the assertion
    * @param lowerBound lower bound of the interval
    * @param upperBound upper bound of the interval
    * @param includeLowerBound is a value equal to the lower bound allowed?
    * @param includeUpperBound is a value equal to the upper bound allowed?
    * @param hint A hint to provide additional context why a constraint could have failed
    * @param analyzerOptions Options to configure analyzer behavior (NullTreatment, FilteredRow)
    * @return
    */
  def isContainedIn(
      column: String,
      lowerBound: Double,
      upperBound: Double,
      includeLowerBound: Boolean = true,
      includeUpperBound: Boolean = true,
      hint: Option[String] = None,
      analyzerOptions: Option[AnalyzerOptions] = None)
    : CheckWithLastConstraintFilterable = {

    // Inclusive bounds use the non-strict comparison operators
    val lowerComparison = if (includeLowerBound) ">=" else ">"
    val upperComparison = if (includeUpperBound) "<=" else "<"

    val withinInterval =
      s"(`$column` $lowerComparison $lowerBound AND `$column` $upperComparison $upperBound)"

    // NULL values comply, all other values have to lie within the interval
    val predicate = s"`$column` IS NULL OR $withinInterval"

    satisfies(
      predicate,
      s"$column between $lowerBound and $upperBound",
      hint = hint,
      columns = List(column),
      analyzerOptions = analyzerOptions)
  }

  /**
    * Evaluate this check on computed metrics
    * @param context result of the metrics computation
    * @return
    */
  def evaluate(context: AnalyzerContext): CheckResult = {

    val metrics = context.metricMap
    val constraintResults = constraints.map { constraint => constraint.evaluate(metrics) }

    val hasFailedConstraint =
      constraintResults.exists { result => result.status == ConstraintStatus.Failure }

    // Without failures the check succeeds; otherwise the check level decides on the status
    val checkStatus =
      if (!hasFailedConstraint) {
        CheckStatus.Success
      } else {
        level match {
          case CheckLevel.Error => CheckStatus.Error
          case CheckLevel.Warning => CheckStatus.Warning
          case _ => CheckStatus.Success
        }
      }

    CheckResult(this, checkStatus, constraintResults)
  }

  def requiredAnalyzers(): Set[Analyzer[_, Metric[_]]] = {
    // Decorated constraints are unwrapped first so that the inner constraint is inspected
    val unwrappedConstraints = constraints.map {
      case decorated: ConstraintDecorator => decorated.inner
      case plain: Constraint => plain
    }

    // Only analysis based constraints carry an analyzer, all others are dropped
    val analyzers = unwrappedConstraints.collect {
      case analysisBased: AnalysisBasedConstraint[_, _, _] =>
        analysisBased.analyzer.asInstanceOf[Analyzer[_, Metric[_]]]
    }

    analyzers.toSet
  }
}

object Check {

  /** A common assertion function checking if the value is 1 */
  val IsOne: Double => Boolean = { _ == 1.0 }

  /**
    * Builds a check that consists of exactly one, already constructed constraint
    *
    * @param constraint  The constraint the check should contain
    * @param description The description handed to the created check
    * @param checkLevel  The level handed to the created check, defaults to CheckLevel.Error
    * @return A check holding only the given constraint
    */
  def fromConstraint(constraint: Constraint,
                     description: String,
                     checkLevel: CheckLevel.Value = CheckLevel.Error): Check = {
    Check(checkLevel, description, constraints = Seq(constraint))
  }

  /**
    * Common assertion function checking if the value can be considered as normal (that no
    * anomalies were detected), given the anomaly detection strategy and details on how to retrieve
    * the history
    *
    * Fails with an IllegalArgumentException (via require) if no previous results are found or if
    * the newest previous result is dated so late that no later test point can be created.
    *
    * @param metricsRepository        A metrics repository to get the previous results
    * @param anomalyDetectionStrategy The anomaly detection strategy
    * @param analyzer                 The analyzer for the metric to run anomaly detection on
    * @param withTagValues            Can contain a Map with tag names and the corresponding values
    *                                 to filter for
    * @param afterDate                The minimum dateTime of previous AnalysisResults to use for
    *                                 the Anomaly Detection
    * @param beforeDate               The maximum dateTime of previous AnalysisResults to use for
    *                                 the Anomaly Detection
    * @param currentMetricValue       current metric value
    * @tparam S                       State type of the given analyzer
    * @return true if the anomaly detection reports no anomalies for the new point
    */
  private[deequ] def isNewestPointNonAnomalous[S <: State[S]](
      metricsRepository: MetricsRepository,
      anomalyDetectionStrategy: AnomalyDetectionStrategy,
      analyzer: Analyzer[S, Metric[Double]],
      withTagValues: Map[String, String],
      afterDate: Option[Long],
      beforeDate: Option[Long])(
      currentMetricValue: Double)
    : Boolean = {

    // Get history keys: restrict by tags first, then by the optional date bounds and finally
    // by the analyzer (same order of loader calls as before, just without a mutable variable)
    val taggedLoader = metricsRepository.load().withTagValues(withTagValues)
    val upperBoundedLoader =
      beforeDate.fold(taggedLoader) { date => taggedLoader.before(date) }
    val dateBoundedLoader =
      afterDate.fold(upperBoundedLoader) { date => upperBoundedLoader.after(date) }

    val analysisResults = dateBoundedLoader.forAnalyzers(Seq(analyzer)).get()

    require(analysisResults.nonEmpty, "There have to be previous results in the MetricsRepository!")

    val historicalMetrics = analysisResults
      // If we have multiple DataPoints with the same dateTime, which should not happen in most
      // cases, we still want consistent behaviour, so we sort them by Tags first
      // (sorting is stable in Scala)
      .sortBy(_.resultKey.tags.values)
      .map { analysisResult =>
        // Only the first entry of the loaded metric map is taken into account
        val doubleMetricOption = analysisResult.analyzerContext.metricMap.headOption
          .map { case (_, metric) => metric.asInstanceOf[Metric[Double]] }

        (analysisResult.resultKey.dataSetDate, doubleMetricOption)
      }

    // Ensure this is the last dataPoint. The bound is checked BEFORE adding 1: otherwise a
    // stored date of Long.MaxValue would silently wrap around to Long.MinValue and slip
    // through a check on the sum.
    val newestHistoricalDate = analysisResults.map(_.resultKey.dataSetDate).max
    require(newestHistoricalDate < Long.MaxValue - 1,
      "Test DateTime cannot be Long.MaxValue, otherwise the " +
        "Anomaly Detection, which works with an open upper interval bound, won't test anything")
    val testDateTime = newestHistoricalDate + 1

    // Run given anomaly detection strategy and return false if the newest value is an Anomaly
    val anomalyDetector = AnomalyDetector(anomalyDetectionStrategy)
    val detectedAnomalies = anomalyDetector.isNewPointAnomalous(
      HistoryUtils.extractMetricValues[Double](historicalMetrics),
      DataPoint(testDateTime, Some(currentMetricValue)))

    detectedAnomalies.anomalies.isEmpty
  }
}




© 2015 - 2025 Weber Informatics LLC | Privacy Policy