
// com.amazon.deequ.checks.Check.scala Maven / Gradle / Ivy
/**
* Copyright 2024 Amazon.com, Inc. or its affiliates. All Rights Reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License"). You may not
* use this file except in compliance with the License. A copy of the License
* is located at
*
* http://aws.amazon.com/apache2.0/
*
* or in the "license" file accompanying this file. This file is distributed on
* an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either
* express or implied. See the License for the specific language governing
* permissions and limitations under the License.
*
*/
package com.amazon.deequ.checks
import com.amazon.deequ.analyzers.runners.AnalyzerContext
import com.amazon.deequ.analyzers.Analyzer
import com.amazon.deequ.analyzers.AnalyzerOptions
import com.amazon.deequ.analyzers.DatasetMatchAnalyzer
import com.amazon.deequ.analyzers.DatasetMatchState
import com.amazon.deequ.analyzers.Histogram
import com.amazon.deequ.analyzers.KLLParameters
import com.amazon.deequ.analyzers.Patterns
import com.amazon.deequ.analyzers.State
import com.amazon.deequ.anomalydetection.HistoryUtils
import com.amazon.deequ.anomalydetection.AnomalyDetectionStrategy
import com.amazon.deequ.anomalydetection.AnomalyDetector
import com.amazon.deequ.anomalydetection.DataPoint
import com.amazon.deequ.checks.ColumnCondition.isAnyNotNull
import com.amazon.deequ.checks.ColumnCondition.isEachNotNull
import com.amazon.deequ.constraints.Constraint._
import com.amazon.deequ.constraints._
import com.amazon.deequ.metrics.BucketDistribution
import com.amazon.deequ.metrics.Distribution
import com.amazon.deequ.metrics.Metric
import com.amazon.deequ.repository.MetricsRepository
import org.apache.spark.sql.DataFrame
import org.apache.spark.sql.expressions.UserDefinedFunction
import scala.util.matching.Regex
/**
 * Levels that can be assigned to a [[Check]].
 *
 * Members are declared one per line; their ids follow the declaration order
 * (`Error` = 0, `Warning` = 1).
 */
object CheckLevel extends Enumeration {
  val Error: Value = Value
  val Warning: Value = Value
}
/**
 * Possible statuses of an evaluated check.
 *
 * Declaration order is significant because it fixes the ids and therefore the ordering of the
 * values: `Success` (0) < `Warning` (1) < `Error` (2).
 */
object CheckStatus extends Enumeration {
  val Success: Value = Value
  val Warning: Value = Value
  val Error: Value = Value
}
/**
 * Bundles a [[Check]] with the status recorded for it and the results of its constraints.
 *
 * @param check The check this result refers to
 * @param status The status recorded for the check
 * @param constraintResults Results of the individual constraints (presumably one entry per
 *                          constraint of `check` - confirm against the code producing it)
 */
case class CheckResult(
check: Check,
status: CheckStatus.Value,
constraintResults: Seq[ConstraintResult])
/**
* A class representing a list of constraints that can be applied to a given
* [[org.apache.spark.sql.DataFrame]]. In order to run the checks, use the `run` method. You can
* also use VerificationSuite.run to run your checks along with other Checks and Analysis objects.
* When run with VerificationSuite, Analyzers required by multiple checks/analysis blocks are
* optimized to run only once.
*
* @param level Assertion level of the check group. If any of the constraints fail this
* level is used for the status of the check.
* @param description The name describes the check block. Generally will be used to show in
* the logs.
* @param constraints The constraints to apply when this check is run. New ones can be added
* and will return a new object
*/
case class Check(
level: CheckLevel.Value,
description: String,
private[deequ] val constraints: Seq[Constraint] = Seq.empty) {
/**
* Returns the name of the columns where each Constraint puts row-level results, if any
*
*/
def getRowLevelConstraintColumnNames(): Seq[String] = {
  // Only row-level constraints contribute a column name. `collect` filters and maps in one
  // step, without wrapping each element in an Option and without shadowing the lambda
  // parameter inside the pattern match.
  constraints.collect { case rowLevel: RowLevelConstraint => rowLevel.getColumnName }
}
/**
* Returns a new Check object with the given constraint added to the constraints list.
*
* @param constraint New constraint to be added
* @return
*/
def addConstraint(constraint: Constraint): Check = {
  // The receiver is not modified; the returned check carries the extended constraint list.
  val extendedConstraints = constraints :+ constraint
  Check(level, description, extendedConstraints)
}
/** Adds a constraint that can subsequently be replaced with a filtered version */
private[this] def addFilterableConstraint(
    creationFunc: Option[String] => Constraint)
  : CheckWithLastConstraintFilterable = {
  // Calling the factory with None yields the variant without a filter. The factory itself is
  // handed on as well, so the last constraint can be re-created with a filter afterwards.
  val unfiltered = creationFunc(None)
  val withUnfiltered = constraints :+ unfiltered
  CheckWithLastConstraintFilterable(level, description, withUnfiltered, creationFunc)
}
/**
* Creates a constraint that calculates the data frame size and runs the assertion on it.
*
* @param assertion Function that receives a long input parameter and returns a boolean
* Assertion functions might refer to the data frame size by "_"
* .hasSize(_>5), meaning the number of rows should be greater than 5
* Or more elaborate function might be provided
* .hasSize{ aNameForSize => aNameForSize > 0 && aNameForSize < 10 }
* @param hint A hint to provide additional context why a constraint could have failed
* @return
*/
def hasSize(assertion: Long => Boolean, hint: Option[String] = None)
  : CheckWithLastConstraintFilterable = {
  // Built through a factory so that a filtered variant can replace the constraint later.
  val sizeConstraintFor = (where: Option[String]) =>
    Constraint.sizeConstraint(assertion, where, hint)
  addFilterableConstraint(sizeConstraintFor)
}
/**
 * Adds the constraint created by `Constraint.columnCountConstraint` for the given assertion.
 *
 * NOTE(review): the filter offered by the filterable-constraint mechanism is not passed on to
 * `columnCountConstraint`, so filtering this constraint afterwards presumably has no effect -
 * confirm that this is intended.
 *
 * @param assertion Function that receives a long input parameter and returns a boolean
 * @param hint A hint to provide additional context why a constraint could have failed
 * @return
 */
def hasColumnCount(assertion: Long => Boolean, hint: Option[String] = None)
  : CheckWithLastConstraintFilterable = {
  // The filter argument is deliberately ignored, exactly as before.
  addFilterableConstraint(_ => Constraint.columnCountConstraint(assertion, hint))
}
/**
* Creates a constraint that asserts on a column completion.
*
* @param column Column to run the assertion on
* @param hint A hint to provide additional context why a constraint could have failed
* @param analyzerOptions Options to configure analyzer behavior (NullTreatment, FilteredRow)
* @return
*/
def isComplete(
    column: String,
    hint: Option[String] = None,
    analyzerOptions: Option[AnalyzerOptions] = None)
  : CheckWithLastConstraintFilterable = {
  // Same constraint as hasCompleteness, with the assertion fixed to Check.IsOne.
  val completenessFor = (where: Option[String]) =>
    completenessConstraint(column, Check.IsOne, where, hint, analyzerOptions)
  addFilterableConstraint(completenessFor)
}
/**
* Creates a constraint that asserts on a column completion.
* Uses the given history selection strategy to retrieve historical completeness values on this
* column from the history provider.
*
* @param column Column to run the assertion on
* @param assertion Function that receives a double input parameter and returns a boolean
* @param hint A hint to provide additional context why a constraint could have failed
* @param analyzerOptions Options to configure analyzer behavior (NullTreatment, FilteredRow)
* @return
*/
def hasCompleteness(
    column: String,
    assertion: Double => Boolean,
    hint: Option[String] = None,
    analyzerOptions: Option[AnalyzerOptions] = None)
  : CheckWithLastConstraintFilterable = {
  // The caller-supplied assertion is applied to the completeness constraint of `column`.
  addFilterableConstraint(rowFilter =>
    completenessConstraint(column, assertion, rowFilter, hint, analyzerOptions))
}
/**
* Creates a constraint that asserts on completion in combined set of columns.
*
* @param columns Columns to run the assertion on
* @param hint A hint to provide additional context why a constraint could have failed
* @return
*/
def areComplete(
    columns: Seq[String],
    hint: Option[String] = None)
  : CheckWithLastConstraintFilterable = {
  // Expressed as a compliance check on the condition produced by isEachNotNull.
  satisfies(
    columnCondition = isEachNotNull(columns),
    constraintName = "Combined Completeness",
    assertion = Check.IsOne,
    hint = hint,
    columns = columns.toList)
}
/**
* Creates a constraint that asserts on completion in combined set of columns.
*
* @param columns Columns to run the assertion on
* @param assertion Function that receives a double input parameter and returns a boolean
* @param hint A hint to provide additional context why a constraint could have failed
* @return
*/
def haveCompleteness(
    columns: Seq[String],
    assertion: Double => Boolean,
    hint: Option[String] = None)
  : CheckWithLastConstraintFilterable = {
  // Same condition as areComplete, but with a caller-supplied assertion.
  satisfies(
    columnCondition = isEachNotNull(columns),
    constraintName = "Combined Completeness",
    assertion = assertion,
    hint = hint,
    columns = columns.toList)
}
/**
* Creates a constraint that asserts on completion in combined set of columns.
*
* @param columns Columns to run the assertion on
* @param hint A hint to provide additional context why a constraint could have failed
* @return
*/
def areAnyComplete(
    columns: Seq[String],
    hint: Option[String] = None)
  : CheckWithLastConstraintFilterable = {
  // Expressed as a compliance check on the condition produced by isAnyNotNull.
  satisfies(
    columnCondition = isAnyNotNull(columns),
    constraintName = "Any Completeness",
    assertion = Check.IsOne,
    hint = hint,
    columns = columns.toList)
}
/**
* Creates a constraint that asserts on completion in combined set of columns.
*
* @param columns Columns to run the assertion on
* @param assertion Function that receives a double input parameter and returns a boolean
* @param hint A hint to provide additional context why a constraint could have failed
* @return
*/
def haveAnyCompleteness(
    columns: Seq[String],
    assertion: Double => Boolean,
    hint: Option[String] = None)
  : CheckWithLastConstraintFilterable = {
  // Same condition as areAnyComplete, but with a caller-supplied assertion.
  satisfies(
    columnCondition = isAnyNotNull(columns),
    constraintName = "Any Completeness",
    assertion = assertion,
    hint = hint,
    columns = columns.toList)
}
/**
* Creates a constraint that asserts on a column uniqueness.
*
* @param column Column to run the assertion on
* @param hint A hint to provide additional context why a constraint could have failed
* @param analyzerOptions Options to configure analyzer behavior (NullTreatment, FilteredRow)
* @return
*/
def isUnique(
    column: String,
    hint: Option[String] = None,
    analyzerOptions: Option[AnalyzerOptions] = None)
  : CheckWithLastConstraintFilterable = {
  // Uniqueness over a single-column key with the assertion fixed to Check.IsOne.
  val uniquenessFor = (where: Option[String]) =>
    uniquenessConstraint(Seq(column), Check.IsOne, where, hint, analyzerOptions)
  addFilterableConstraint(uniquenessFor)
}
/**
* Creates a constraint that asserts on a column(s) primary key characteristics.
* Currently only checks uniqueness, but reserved for primary key checks if there is another
* assertion to run on primary key columns.
*
* @param column First key column to run the assertion on
* @param columns Additional key columns that are combined with `column`
* @return
*/
def isPrimaryKey(column: String, columns: String*): CheckWithLastConstraintFilterable = {
  // The key is `column` followed by all additional `columns`; the list is built inside the
  // factory, i.e. every time the constraint is (re-)created.
  addFilterableConstraint(rowFilter =>
    uniquenessConstraint(column :: columns.toList, Check.IsOne, rowFilter))
}
/**
* Creates a constraint that asserts on a column(s) primary key characteristics.
* Currently only checks uniqueness, but reserved for primary key checks if there is another
* assertion to run on primary key columns.
*
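* @param column First key column to run the assertion on
* @param hint A hint to provide additional context why a constraint could have failed
* @param analyzerOptions Options to configure analyzer behavior (NullTreatment, FilteredRow)
* @param columns Additional key columns that are combined with `column`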
* @return
*/
def isPrimaryKey(
    column: String,
    hint: Option[String],
    analyzerOptions: Option[AnalyzerOptions],
    columns: String*)
  : CheckWithLastConstraintFilterable = {
  // The key is `column` followed by all additional `columns`.
  addFilterableConstraint(rowFilter =>
    uniquenessConstraint(column :: columns.toList, Check.IsOne, rowFilter, hint, analyzerOptions))
}
/**
* Creates a constraint that asserts on a column(s) primary key characteristics.
* Currently only checks uniqueness, but reserved for primary key checks if there is another
* assertion to run on primary key columns.
*
* @param column First key column to run the assertion on
* @param hint A hint to provide additional context why a constraint could have failed
* @param columns Additional key columns that are combined with `column`
* @return
*/
def isPrimaryKey(
    column: String,
    hint: Option[String],
    columns: String*)
  : CheckWithLastConstraintFilterable = {
  // The key is `column` followed by all additional `columns`.
  addFilterableConstraint(rowFilter =>
    uniquenessConstraint(column :: columns.toList, Check.IsOne, rowFilter, hint))
}
/**
* Creates a constraint that asserts on uniqueness in a single or combined set of key columns.
*
* @param columns Key columns
* @param assertion Function that receives a double input parameter and returns a boolean.
* Refers to the fraction of unique values
* @return
*/
def hasUniqueness(
    columns: Seq[String],
    assertion: Double => Boolean)
  : CheckWithLastConstraintFilterable = {
  // Hint and analyzer options are left to the defaults of uniquenessConstraint.
  val uniquenessFor = (where: Option[String]) =>
    uniquenessConstraint(columns, assertion, where)
  addFilterableConstraint(uniquenessFor)
}
/**
* Creates a constraint that asserts on uniqueness in a single or combined set of key columns.
*
* @param columns Key columns
* @param assertion Function that receives a double input parameter and returns a boolean.
* Refers to the fraction of unique values
* @param hint A hint to provide additional context why a constraint could have failed
* @return
*/
def hasUniqueness(
    columns: Seq[String],
    assertion: Double => Boolean,
    hint: Option[String])
  : CheckWithLastConstraintFilterable = {
  // Analyzer options are left to the default of uniquenessConstraint.
  val uniquenessFor = (where: Option[String]) =>
    uniquenessConstraint(columns, assertion, where, hint)
  addFilterableConstraint(uniquenessFor)
}
/**
* Creates a constraint that asserts on uniqueness in a single or combined set of key columns.
*
* @param columns Key columns
* @param assertion Function that receives a double input parameter and returns a boolean.
* Refers to the fraction of unique values
* @param hint A hint to provide additional context why a constraint could have failed
* @param analyzerOptions Options to configure analyzer behavior (NullTreatment, FilteredRow)
* @return
*/
def hasUniqueness(
    columns: Seq[String],
    assertion: Double => Boolean,
    hint: Option[String],
    analyzerOptions: Option[AnalyzerOptions])
  : CheckWithLastConstraintFilterable = {
  // Most general overload: every argument is passed through to uniquenessConstraint.
  val uniquenessFor = (where: Option[String]) =>
    uniquenessConstraint(columns, assertion, where, hint, analyzerOptions)
  addFilterableConstraint(uniquenessFor)
}
/**
* Creates a constraint that asserts on the uniqueness of a key column.
*
* @param column Key column
* @param assertion Function that receives a double input parameter and returns a boolean.
* Refers to the fraction of unique values.
* @return
*/
def hasUniqueness(
    column: String,
    assertion: Double => Boolean)
  : CheckWithLastConstraintFilterable = {
  // Single-column convenience form: wrap the column and use the Seq-based overload.
  val keyColumns: Seq[String] = Seq(column)
  hasUniqueness(keyColumns, assertion)
}
/**
* Creates a constraint that asserts on the uniqueness of a key column.
*
* @param column Key column
* @param assertion Function that receives a double input parameter and returns a boolean.
* Refers to the fraction of unique values.
* @param hint A hint to provide additional context why a constraint could have failed
* @return
*/
def hasUniqueness(
    column: String,
    assertion: Double => Boolean,
    hint: Option[String])
  : CheckWithLastConstraintFilterable = {
  // Single-column convenience form: wrap the column and use the Seq-based overload.
  val keyColumns: Seq[String] = Seq(column)
  hasUniqueness(keyColumns, assertion, hint)
}
/**
* Creates a constraint that asserts on the uniqueness of a key column.
*
* @param column Key column
* @param assertion Function that receives a double input parameter and returns a boolean.
* Refers to the fraction of unique values.
* @param hint A hint to provide additional context why a constraint could have failed
* @param analyzerOptions Options to configure analyzer behavior (NullTreatment, FilteredRow)
* @return
*/
def hasUniqueness(
    column: String,
    assertion: Double => Boolean,
    hint: Option[String],
    analyzerOptions: Option[AnalyzerOptions])
  : CheckWithLastConstraintFilterable = {
  // Single-column convenience form: wrap the column and use the Seq-based overload.
  val keyColumns: Seq[String] = Seq(column)
  hasUniqueness(keyColumns, assertion, hint, analyzerOptions)
}
/**
* Creates a constraint on the distinctness in a single or combined set of key columns.
*
* @param columns columns
* @param assertion Function that receives a double input parameter and returns a boolean.
* Refers to the fraction of distinct values.
* @param hint A hint to provide additional context why a constraint could have failed
* @return
*/
def hasDistinctness(
    columns: Seq[String],
    assertion: Double => Boolean,
    hint: Option[String] = None)
  : CheckWithLastConstraintFilterable = {
  // Factory form so that a filtered variant can replace the constraint later.
  val distinctnessFor = (where: Option[String]) =>
    distinctnessConstraint(columns, assertion, where, hint)
  addFilterableConstraint(distinctnessFor)
}
/**
* Creates a constraint on the unique value ratio in a single or combined set of key columns.
*
* @param columns columns
* @param assertion Function that receives a double input parameter and returns a boolean.
* Refers to the fraction of distinct values.
* @param hint A hint to provide additional context why a constraint could have failed
* @param analyzerOptions Options to configure analyzer behavior (NullTreatment, FilteredRow)
* @return
*/
def hasUniqueValueRatio(
    columns: Seq[String],
    assertion: Double => Boolean,
    hint: Option[String] = None,
    analyzerOptions: Option[AnalyzerOptions] = None)
  : CheckWithLastConstraintFilterable = {
  // Every argument is passed through to uniqueValueRatioConstraint unchanged.
  addFilterableConstraint(rowFilter =>
    uniqueValueRatioConstraint(columns, assertion, rowFilter, hint, analyzerOptions))
}
/**
* Performs a dataset check between the base DataFrame supplied to
* [[com.amazon.deequ.VerificationSuite.onData]] and other DataFrame supplied to this check using Deequ's
* [[com.amazon.deequ.comparison.DataSynchronization.columnMatch]] framework.
* This method compares specified columns of both DataFrames and assesses match based on a custom assertion.
*
* Utilizes [[com.amazon.deequ.analyzers.DatasetMatchAnalyzer]] for comparing the data
* and Constraint [[com.amazon.deequ.constraints.DatasetMatchConstraint]].
*
* Usage:
* To use this method, create a VerificationSuite and invoke this method as part of adding checks:
* {{{
* val baseDataFrame: DataFrame = ...
* val otherDataFrame: DataFrame = ...
* val columnMappings: Map[String, String] = Map("baseCol1" -> "otherCol1", "baseCol2" -> "otherCol2")
* val assertionFunction: Double => Boolean = _ > 0.7
*
* val check = new Check(CheckLevel.Error, "Data Synchronization Check")
* .doesDatasetMatch(otherDataFrame, columnMappings, assertionFunction)
*
* val verificationResult = VerificationSuite()
* .onData(baseDataFrame)
* .addCheck(check)
* .run()
* }}}
*
* This will add a dataset match check to the VerificationSuite, comparing the specified columns of
* baseDataFrame and otherDataFrame based on the provided assertion function.
*
* @param otherDataset The DataFrame to be compared with the current one. Analyzed in conjunction with the
* current DataFrame to assess data synchronization.
* @param keyColumnMappings A map defining the column correlations between the current DataFrame and otherDataset.
* Keys represent column names in the current DataFrame, and values are corresponding
* column names in otherDataset.
* @param assertion A function that takes a Double (result of the comparison) and returns a Boolean. Defines the
* condition under which the data in both DataFrames is considered synchronized. For example
* (_ > 0.7) denoting metric value > 0.7 or 70% of records.
* @param matchColumnMappings A map defining the column correlations between the current DataFrame and otherDataset.
* These are the columns which we will check for equality, post joining. It's an optional
* value that defaults to None, in which case it is derived from `keyColumnMappings`.
* @param hint Optional. Additional context or information about the synchronization check.
* Helpful for understanding the intent or specifics of the check. Default is None.
* @return A [[com.amazon.deequ.checks.Check]] object representing the outcome of the dataset match check.
* This object can be used in Deequ's verification suite to assert data quality constraints.
*
*/
def doesDatasetMatch(
    otherDataset: DataFrame,
    keyColumnMappings: Map[String, String],
    assertion: Double => Boolean,
    matchColumnMappings: Option[Map[String, String]] = None,
    hint: Option[String] = None)
  : Check = {
  // The same assertion is handed to the analyzer and to the constraint that wraps it.
  // The result is a plain Check: this constraint is added without the filterable mechanism.
  addConstraint(
    AnalysisBasedConstraint[DatasetMatchState, Double, Double](
      DatasetMatchAnalyzer(otherDataset, keyColumnMappings, assertion, matchColumnMappings),
      assertion,
      hint = hint))
}
/**
* Creates a constraint that asserts on the number of distinct values a column has.
*
* @param column Column to run the assertion on
* @param assertion Function that receives a long input parameter and returns a boolean
* @param binningUdf An optional binning function
* @param maxBins Histogram details is only provided for N column values with top counts.
* maxBins sets the N
* @param hint A hint to provide additional context why a constraint could have failed
* @return
*/
def hasNumberOfDistinctValues(
    column: String,
    assertion: Long => Boolean,
    binningUdf: Option[UserDefinedFunction] = None,
    maxBins: Integer = Histogram.MaximumAllowedDetailBins,
    hint: Option[String] = None)
  : CheckWithLastConstraintFilterable = {
  // computeFrequenciesAsRatio is switched off explicitly for this constraint.
  val histogramBinsFor = (where: Option[String]) =>
    histogramBinConstraint(
      column,
      assertion,
      binningUdf,
      maxBins,
      where,
      hint,
      computeFrequenciesAsRatio = false)
  addFilterableConstraint(histogramBinsFor)
}
/**
* Creates a constraint that asserts on column's value distribution.
*
* @param column Column to run the assertion on
* @param assertion Function that receives a Distribution input parameter and returns a boolean.
* E.g
* .hasHistogramValues("att2", _.absolutes("f") == 3)
* .hasHistogramValues("att2",
* _.ratios(Histogram.NullFieldReplacement) == 2/6.0)
* @param binningUdf An optional binning function
* @param maxBins Histogram details is only provided for N column values with top counts.
* maxBins sets the N
* @param hint A hint to provide additional context why a constraint could have failed
* @return
*/
def hasHistogramValues(
    column: String,
    assertion: Distribution => Boolean,
    binningUdf: Option[UserDefinedFunction] = None,
    maxBins: Integer = Histogram.MaximumAllowedDetailBins,
    hint: Option[String] = None)
  : CheckWithLastConstraintFilterable = {
  // Factory form so that a filtered variant can replace the constraint later.
  val histogramFor = (where: Option[String]) =>
    histogramConstraint(column, assertion, binningUdf, maxBins, where, hint)
  addFilterableConstraint(histogramFor)
}
/**
* Creates a constraint that asserts on column's sketch size.
*
* @param column Column to run the assertion on
* @param assertion Function that receives a BucketDistribution input parameter and returns a boolean.
* E.g
* .kllSketchSatisfies("att2", _.parameters(1) >= 16,
* kllParameters = Option(KLLParameters(2, 0.64, 2)))
* @param kllParameters parameters of KLL Sketch
* @param hint A hint to provide additional context why a constraint could have failed
* @return
*/
def kllSketchSatisfies(
    column: String,
    assertion: BucketDistribution => Boolean,
    kllParameters: Option[KLLParameters] = None,
    hint: Option[String] = None)
  : Check = {
  // Added as a plain constraint (no filter factory), hence the result type Check.
  val sketchConstraint = kllConstraint(column, assertion, kllParameters, hint)
  addConstraint(sketchConstraint)
}
/**
* Creates a constraint that runs AnomalyDetection on the new value
*
* @param metricsRepository A metrics repository to get the previous results
* @param anomalyDetectionStrategy The anomaly detection strategy
* @param analyzer The analyzer for the metric to run anomaly detection on
* @param withTagValues Can contain a Map with tag names and the corresponding values
* to filter for
* @param beforeDate The maximum dateTime of previous AnalysisResults to use for
* the Anomaly Detection
* @param afterDate The minimum dateTime of previous AnalysisResults to use for
* the Anomaly Detection
* @param hint A hint to provide additional context why a constraint
* could have failed
* @return
*/
private[deequ] def isNewestPointNonAnomalous[S <: State[S]](
    metricsRepository: MetricsRepository,
    anomalyDetectionStrategy: AnomalyDetectionStrategy,
    analyzer: Analyzer[S, Metric[Double]],
    withTagValues: Map[String, String],
    afterDate: Option[Long],
    beforeDate: Option[Long],
    hint: Option[String] = None)
  : Check = {
  // All arguments of the first parameter list are bound here; the trailing `(_)` leaves the
  // second parameter list of the companion's method open, and that function is the assertion.
  val isNonAnomalous = Check.isNewestPointNonAnomalous(
    metricsRepository, anomalyDetectionStrategy, analyzer, withTagValues, afterDate, beforeDate)(_)
  val constraint = anomalyConstraint(analyzer, isNonAnomalous, hint)
  addConstraint(constraint)
}
/**
* Creates a constraint that asserts on a column entropy.
*
* @param column Column to run the assertion on
* @param assertion Function that receives a double input parameter and returns a boolean
* @param hint A hint to provide additional context why a constraint could have failed
* @return
*/
def hasEntropy(
    column: String,
    assertion: Double => Boolean,
    hint: Option[String] = None)
  : CheckWithLastConstraintFilterable = {
  // Factory form so that a filtered variant can replace the constraint later.
  val entropyFor = (where: Option[String]) =>
    entropyConstraint(column, assertion, where, hint)
  addFilterableConstraint(entropyFor)
}
/**
* Creates a constraint that asserts on a mutual information between two columns.
*
* @param columnA First column for mutual information calculation
* @param columnB Second column for mutual information calculation
* @param assertion Function that receives a double input parameter and returns a boolean
* @param hint A hint to provide additional context why a constraint could have failed
* @return
*/
def hasMutualInformation(
    columnA: String,
    columnB: String,
    assertion: Double => Boolean,
    hint: Option[String] = None)
  : CheckWithLastConstraintFilterable = {
  // Both columns are passed on in the order given by the caller.
  addFilterableConstraint(rowFilter =>
    mutualInformationConstraint(columnA, columnB, assertion, rowFilter, hint))
}
/**
* Creates a constraint that asserts on an approximated quantile
*
* @param column Column to run the assertion on
* @param quantile Which quantile to assert on
* @param assertion Function that receives a double input parameter (the computed quantile)
* and returns a boolean
* @param hint A hint to provide additional context why a constraint could have failed
* @return
*/
def hasApproxQuantile(
    column: String,
    quantile: Double,
    assertion: Double => Boolean,
    hint: Option[String] = None)
  : CheckWithLastConstraintFilterable = {
  // Factory form so that a filtered variant can replace the constraint later.
  val approxQuantileFor = (where: Option[String]) =>
    approxQuantileConstraint(column, quantile, assertion, where, hint)
  addFilterableConstraint(approxQuantileFor)
}
/**
* Creates a constraint that asserts on an exact quantile
*
* @param column Column to run the assertion on
* @param quantile Which quantile to assert on
* @param assertion Function that receives a double input parameter (the computed quantile)
* and returns a boolean
* @param hint A hint to provide additional context why a constraint could have failed
* @return
*/
def hasExactQuantile(
    column: String,
    quantile: Double,
    assertion: Double => Boolean,
    hint: Option[String] = None)
  : CheckWithLastConstraintFilterable = {
  // Factory form so that a filtered variant can replace the constraint later.
  val exactQuantileFor = (where: Option[String]) =>
    exactQuantileConstraint(column, quantile, assertion, where, hint)
  addFilterableConstraint(exactQuantileFor)
}
/**
* Creates a constraint that asserts on the minimum length of the column
*
* @param column Column to run the assertion on
* @param assertion Function that receives a double input parameter and returns a boolean
* @param hint A hint to provide additional context why a constraint could have failed
* @param analyzerOptions Options to configure analyzer behavior (NullTreatment, FilteredRow)
* @return
*/
def hasMinLength(
    column: String,
    assertion: Double => Boolean,
    hint: Option[String] = None,
    analyzerOptions: Option[AnalyzerOptions] = None)
  : CheckWithLastConstraintFilterable = {
  // Every argument is passed through to minLengthConstraint unchanged.
  addFilterableConstraint(rowFilter =>
    minLengthConstraint(column, assertion, rowFilter, hint, analyzerOptions))
}
/**
* Creates a constraint that asserts on the maximum length of the column
*
* @param column Column to run the assertion on
* @param assertion Function that receives a double input parameter and returns a boolean
* @param hint A hint to provide additional context why a constraint could have failed
* @param analyzerOptions Options to configure analyzer behavior (NullTreatment, FilteredRow)
* @return
*/
def hasMaxLength(
    column: String,
    assertion: Double => Boolean,
    hint: Option[String] = None,
    analyzerOptions: Option[AnalyzerOptions] = None)
  : CheckWithLastConstraintFilterable = {
  // Every argument is passed through to maxLengthConstraint unchanged.
  addFilterableConstraint(rowFilter =>
    maxLengthConstraint(column, assertion, rowFilter, hint, analyzerOptions))
}
/**
* Creates a constraint that asserts on the minimum of the column
*
* @param column Column to run the assertion on
* @param assertion Function that receives a double input parameter and returns a boolean
* @param hint A hint to provide additional context why a constraint could have failed
* @param analyzerOptions Options to configure analyzer behavior (NullTreatment, FilteredRow)
* @return
*/
def hasMin(
    column: String,
    assertion: Double => Boolean,
    hint: Option[String] = None,
    analyzerOptions: Option[AnalyzerOptions] = None)
  : CheckWithLastConstraintFilterable = {
  // Factory form so that a filtered variant can replace the constraint later.
  val minFor = (where: Option[String]) =>
    minConstraint(column, assertion, where, hint, analyzerOptions)
  addFilterableConstraint(minFor)
}
/**
* Creates a constraint that asserts on the maximum of the column
*
* @param column Column to run the assertion on
* @param assertion Function that receives a double input parameter and returns a boolean
* @param hint A hint to provide additional context why a constraint could have failed
* @param analyzerOptions Options to configure analyzer behavior (NullTreatment, FilteredRow)
* @return
*/
def hasMax(
    column: String,
    assertion: Double => Boolean,
    hint: Option[String] = None,
    analyzerOptions: Option[AnalyzerOptions] = None)
  : CheckWithLastConstraintFilterable = {
  // Factory form so that a filtered variant can replace the constraint later.
  val maxFor = (where: Option[String]) =>
    maxConstraint(column, assertion, where, hint, analyzerOptions)
  addFilterableConstraint(maxFor)
}
/**
* Creates a constraint that asserts on the mean of the column
*
* @param column Column to run the assertion on
* @param assertion Function that receives a double input parameter and returns a boolean
* @param hint A hint to provide additional context why a constraint could have failed
* @return
*/
def hasMean(
    column: String,
    assertion: Double => Boolean,
    hint: Option[String] = None)
  : CheckWithLastConstraintFilterable = {
  // Factory form so that a filtered variant can replace the constraint later.
  addFilterableConstraint(rowFilter => meanConstraint(column, assertion, rowFilter, hint))
}
/**
* Creates a constraint that asserts on the sum of the column
*
* @param column Column to run the assertion on
* @param assertion Function that receives a double input parameter and returns a boolean
* @param hint A hint to provide additional context why a constraint could have failed
* @return
*/
def hasSum(
    column: String,
    assertion: Double => Boolean,
    hint: Option[String] = None)
  : CheckWithLastConstraintFilterable = {
  // Factory form so that a filtered variant can replace the constraint later.
  addFilterableConstraint(rowFilter => sumConstraint(column, assertion, rowFilter, hint))
}
/**
* Creates a constraint that asserts on the standard deviation of the column
*
* @param column Column to run the assertion on
* @param assertion Function that receives a double input parameter and returns a boolean
* @param hint A hint to provide additional context why a constraint could have failed
* @return
*/
def hasStandardDeviation(
    column: String,
    assertion: Double => Boolean,
    hint: Option[String] = None)
  : CheckWithLastConstraintFilterable = {
  // Factory form so that a filtered variant can replace the constraint later.
  val standardDeviationFor = (where: Option[String]) =>
    standardDeviationConstraint(column, assertion, where, hint)
  addFilterableConstraint(standardDeviationFor)
}
/**
* Creates a constraint that asserts on the approximate count distinct of the given column
*
* @param column Column to run the assertion on
* @param assertion Function that receives a double input parameter and returns a boolean
* @param hint A hint to provide additional context why a constraint could have failed
* @return
*/
def hasApproxCountDistinct(
    column: String,
    assertion: Double => Boolean,
    hint: Option[String] = None)
  : CheckWithLastConstraintFilterable = {
  // Factory form so that a filtered variant can replace the constraint later.
  val approxCountDistinctFor = (where: Option[String]) =>
    approxCountDistinctConstraint(column, assertion, where, hint)
  addFilterableConstraint(approxCountDistinctFor)
}
/**
* Creates a constraint that asserts on the pearson correlation between two columns.
*
* @param columnA First column for correlation calculation
* @param columnB Second column for correlation calculation
* @param assertion Function that receives a double input parameter and returns a boolean
* @param hint A hint to provide additional context why a constraint could have failed
* @return
*/
def hasCorrelation(
    columnA: String,
    columnB: String,
    assertion: Double => Boolean,
    hint: Option[String] = None)
  : CheckWithLastConstraintFilterable = {
  // Both columns are passed on in the order given by the caller.
  addFilterableConstraint(rowFilter =>
    correlationConstraint(columnA, columnB, assertion, rowFilter, hint))
}
/**
* Creates a constraint that runs the given condition on the data frame.
*
* @param columnCondition Data frame column which is a combination of expression and the column
* name. It has to comply with Spark SQL syntax.
* Can be written in an exact same way with conditions inside the
* `WHERE` clause.
* @param constraintName A name that summarizes the check being made. This name is being used to
* name the metrics for the analysis being done.
* @param assertion Function that receives a double input parameter and returns a boolean
* @param hint A hint to provide additional context why a constraint could have failed
* @param analyzerOptions Options to configure analyzer behavior (NullTreatment, FilteredRow)
* @return
*/
def satisfies(
    columnCondition: String,
    constraintName: String,
    assertion: Double => Boolean = Check.IsOne,
    hint: Option[String] = None,
    columns: List[String] = List.empty[String],
    analyzerOptions: Option[AnalyzerOptions] = None)
  : CheckWithLastConstraintFilterable = {
  // Mind the argument order of complianceConstraint: the name comes first and the condition
  // second, i.e. the reverse of this method's own parameter order.
  val complianceFor = (where: Option[String]) =>
    complianceConstraint(
      constraintName,
      columnCondition,
      assertion,
      where,
      hint,
      columns,
      analyzerOptions)
  addFilterableConstraint(complianceFor)
}
/**
 * Checks for pattern compliance. Given a column name and a regular expression, defines a
 * Check on the average compliance of the column's values to the regular expression.
 *
 * @param column          Name of the column that should be checked.
 * @param pattern         The column's values will be checked for a match against this pattern.
 * @param assertion       Function that receives a double input parameter and returns a boolean.
 *                        Defaults to `Check.IsOne`.
 * @param name            Optional name that is forwarded to the pattern match constraint
 *                        (e.g. `containsEmail` passes `containsEmail(<column>)`).
 * @param hint            A hint to provide additional context why a constraint could have failed
 * @param analyzerOptions Options to configure analyzer behavior (NullTreatment, FilteredRow)
 * @return the filterable check produced by `addFilterableConstraint`
 */
def hasPattern(
    column: String,
    pattern: Regex,
    assertion: Double => Boolean = Check.IsOne,
    name: Option[String] = None,
    hint: Option[String] = None,
    analyzerOptions: Option[AnalyzerOptions] = None)
  : CheckWithLastConstraintFilterable = {

  addFilterableConstraint { rowFilter =>
    Constraint.patternMatchConstraint(
      column,
      pattern,
      assertion,
      rowFilter,
      name,
      hint,
      analyzerOptions)
  }
}
/**
 * Check to run against the compliance of a column against a credit card pattern
 * (`Patterns.CREDITCARD`).
 *
 * @param column    Name of the column that should be checked.
 * @param assertion Function that receives a double input parameter and returns a boolean.
 *                  Defaults to `Check.IsOne`.
 * @param hint      A hint to provide additional context why a constraint could have failed
 * @return the filterable check produced by `hasPattern`
 */
def containsCreditCardNumber(
    column: String,
    assertion: Double => Boolean = Check.IsOne,
    hint: Option[String] = None)
  : CheckWithLastConstraintFilterable = {

  val constraintName = s"containsCreditCardNumber($column)"
  hasPattern(column, Patterns.CREDITCARD, assertion, Some(constraintName), hint)
}
/**
 * Check to run against the compliance of a column against an e-mail pattern
 * (`Patterns.EMAIL`).
 *
 * @param column    Name of the column that should be checked.
 * @param assertion Function that receives a double input parameter and returns a boolean.
 *                  Defaults to `Check.IsOne`.
 * @param hint      A hint to provide additional context why a constraint could have failed
 * @return the filterable check produced by `hasPattern`
 */
def containsEmail(
    column: String,
    assertion: Double => Boolean = Check.IsOne,
    hint: Option[String] = None)
  : CheckWithLastConstraintFilterable = {

  val constraintName = s"containsEmail($column)"
  hasPattern(column, Patterns.EMAIL, assertion, Some(constraintName), hint)
}
/**
 * Check to run against the compliance of a column against a URL pattern (`Patterns.URL`).
 *
 * @param column    Name of the column that should be checked.
 * @param assertion Function that receives a double input parameter and returns a boolean.
 *                  Defaults to `Check.IsOne`.
 * @param hint      A hint to provide additional context why a constraint could have failed
 * @return the filterable check produced by `hasPattern`
 */
def containsURL(
    column: String,
    assertion: Double => Boolean = Check.IsOne,
    hint: Option[String] = None)
  : CheckWithLastConstraintFilterable = {

  val constraintName = s"containsURL($column)"
  hasPattern(column, Patterns.URL, assertion, Some(constraintName), hint)
}
/**
 * Check to run against the compliance of a column against the Social security number pattern
 * for the US (`Patterns.SOCIAL_SECURITY_NUMBER_US`).
 *
 * @param column    Name of the column that should be checked.
 * @param assertion Function that receives a double input parameter and returns a boolean.
 *                  Defaults to `Check.IsOne`.
 * @param hint      A hint to provide additional context why a constraint could have failed
 * @return the filterable check produced by `hasPattern`
 */
def containsSocialSecurityNumber(
    column: String,
    assertion: Double => Boolean = Check.IsOne,
    hint: Option[String] = None)
  : CheckWithLastConstraintFilterable = {

  val constraintName = s"containsSocialSecurityNumber($column)"
  hasPattern(column, Patterns.SOCIAL_SECURITY_NUMBER_US, assertion, Some(constraintName), hint)
}
/**
 * Check to run against the fraction of rows that conform to the given data type.
 *
 * @param column    Name of the column that should be checked.
 * @param dataType  Data type that the column should be compared against.
 * @param assertion Function that receives a double input parameter and returns a boolean.
 *                  Defaults to `Check.IsOne`.
 * @param hint      A hint to provide additional context why a constraint could have failed
 * @return the filterable check produced by `addFilterableConstraint`
 */
def hasDataType(
    column: String,
    dataType: ConstrainableDataTypes.Value,
    assertion: Double => Boolean = Check.IsOne,
    hint: Option[String] = None)
  : CheckWithLastConstraintFilterable = {

  addFilterableConstraint(rowFilter =>
    Constraint.dataTypeConstraint(column, dataType, assertion, rowFilter, hint))
}
/**
 * Creates a constraint that asserts that a column contains no negative values.
 * NULL values are treated as compliant.
 *
 * @param column    Column to run the assertion on
 * @param assertion Function that receives a double input parameter and returns a boolean.
 *                  Defaults to `Check.IsOne`.
 * @param hint      A hint to provide additional context why a constraint could have failed
 * @return the filterable check produced by `satisfies`
 */
def isNonNegative(
    column: String,
    assertion: Double => Boolean = Check.IsOne,
    hint: Option[String] = None)
  : CheckWithLastConstraintFilterable = {

  // NULL is coalesced to 0.0 so that missing values are not counted as non-compliant.
  // NOTE: the cast to DECIMAL(20, 10) is needed to handle scientific notations.
  val nonNegativeCondition = s"COALESCE(CAST($column AS DECIMAL(20,10)), 0.0) >= 0"
  val constraintName = s"$column is non-negative"

  satisfies(nonNegativeCondition, constraintName, assertion, hint, List(column))
}
/**
 * Creates a constraint that asserts that a column contains only values strictly greater
 * than zero. NULL values are treated as compliant.
 *
 * @param column    Column to run the assertion on
 * @param assertion Function that receives a double input parameter and returns a boolean.
 *                  Defaults to `Check.IsOne`.
 * @param hint      A hint to provide additional context why a constraint could have failed
 * @return the filterable check produced by `satisfies`
 */
def isPositive(
    column: String,
    assertion: Double => Boolean = Check.IsOne,
    hint: Option[String] = None)
  : CheckWithLastConstraintFilterable = {

  // NULL is coalesced to 1.0 so that missing values are not counted as non-compliant.
  // NOTE: the cast to DECIMAL(20, 10) is needed to handle scientific notations.
  val positiveCondition = s"COALESCE(CAST($column AS DECIMAL(20,10)), 1.0) > 0"
  val constraintName = s"$column is positive"

  satisfies(positiveCondition, constraintName, assertion, hint, List(column))
}
/**
 * Asserts that, in each row, the value of columnA is less than the value of columnB.
 *
 * @param columnA   Column on the left-hand side of the comparison
 * @param columnB   Column on the right-hand side of the comparison
 * @param assertion Function that receives a double input parameter and returns a boolean.
 *                  Defaults to `Check.IsOne`.
 * @param hint      A hint to provide additional context why a constraint could have failed
 * @return the filterable check produced by `satisfies`
 */
def isLessThan(
    columnA: String,
    columnB: String,
    assertion: Double => Boolean = Check.IsOne,
    hint: Option[String] = None)
  : CheckWithLastConstraintFilterable = {

  val comparison = s"$columnA < $columnB"
  val constraintName = s"$columnA is less than $columnB"

  satisfies(comparison, constraintName, assertion, hint, List(columnA, columnB))
}
/**
 * Asserts that, in each row, the value of columnA is less than or equal to the value of
 * columnB.
 *
 * @param columnA   Column on the left-hand side of the comparison
 * @param columnB   Column on the right-hand side of the comparison
 * @param assertion Function that receives a double input parameter and returns a boolean.
 *                  Defaults to `Check.IsOne`.
 * @param hint      A hint to provide additional context why a constraint could have failed
 * @return the filterable check produced by `satisfies`
 */
def isLessThanOrEqualTo(
    columnA: String,
    columnB: String,
    assertion: Double => Boolean = Check.IsOne,
    hint: Option[String] = None)
  : CheckWithLastConstraintFilterable = {

  val comparison = s"$columnA <= $columnB"
  val constraintName = s"$columnA is less than or equal to $columnB"

  satisfies(comparison, constraintName, assertion, hint, List(columnA, columnB))
}
/**
 * Asserts that, in each row, the value of columnA is greater than the value of columnB.
 *
 * @param columnA   Column on the left-hand side of the comparison
 * @param columnB   Column on the right-hand side of the comparison
 * @param assertion Function that receives a double input parameter and returns a boolean.
 *                  Defaults to `Check.IsOne`.
 * @param hint      A hint to provide additional context why a constraint could have failed
 * @return the filterable check produced by `satisfies`
 */
def isGreaterThan(
    columnA: String,
    columnB: String,
    assertion: Double => Boolean = Check.IsOne,
    hint: Option[String] = None)
  : CheckWithLastConstraintFilterable = {

  val comparison = s"$columnA > $columnB"
  val constraintName = s"$columnA is greater than $columnB"

  satisfies(comparison, constraintName, assertion, hint, List(columnA, columnB))
}
/**
 * Asserts that, in each row, the value of columnA is greater than or equal to the value of
 * columnB.
 *
 * @param columnA   Column on the left-hand side of the comparison
 * @param columnB   Column on the right-hand side of the comparison
 * @param assertion Function that receives a double input parameter and returns a boolean.
 *                  Defaults to `Check.IsOne`.
 * @param hint      A hint to provide additional context why a constraint could have failed
 * @return the filterable check produced by `satisfies`
 */
def isGreaterThanOrEqualTo(
    columnA: String,
    columnB: String,
    assertion: Double => Boolean = Check.IsOne,
    hint: Option[String] = None)
  : CheckWithLastConstraintFilterable = {

  val comparison = s"$columnA >= $columnB"
  val constraintName = s"$columnA is greater than or equal to $columnB"

  satisfies(comparison, constraintName, assertion, hint, List(columnA, columnB))
}
// Scala lets only one overloaded alternative declare default arguments, so this overload
// spells out its (shorter) parameter list instead of relying on defaults.
/**
 * Asserts that every non-null value in a column is contained in a set of predefined values.
 * Uses `Check.IsOne` as assertion, no hint and no analyzer options.
 *
 * @param column        Column to run the assertion on
 * @param allowedValues Allowed values for the column
 * @return the filterable check produced by the most general `isContainedIn` overload
 */
def isContainedIn(
    column: String,
    allowedValues: Array[String])
  : CheckWithLastConstraintFilterable = {

  val noHint = Option.empty[String]
  val noAnalyzerOptions = Option.empty[AnalyzerOptions]
  isContainedIn(column, allowedValues, Check.IsOne, noHint, noAnalyzerOptions)
}
// Scala lets only one overloaded alternative declare default arguments, so this overload
// spells out its (shorter) parameter list instead of relying on defaults.
/**
 * Asserts that every non-null value in a column is contained in a set of predefined values.
 * Uses `Check.IsOne` as assertion and no analyzer options.
 *
 * @param column        Column to run the assertion on
 * @param allowedValues Allowed values for the column
 * @param hint          A hint to provide additional context why a constraint could have failed
 * @return the filterable check produced by the most general `isContainedIn` overload
 */
def isContainedIn(
    column: String,
    allowedValues: Array[String],
    hint: Option[String])
  : CheckWithLastConstraintFilterable = {

  val noAnalyzerOptions = Option.empty[AnalyzerOptions]
  isContainedIn(column, allowedValues, Check.IsOne, hint, noAnalyzerOptions)
}
// Scala lets only one overloaded alternative declare default arguments, so this overload
// spells out its (shorter) parameter list instead of relying on defaults.
/**
 * Asserts that every non-null value in a column is contained in a set of predefined values.
 * Uses no hint and no analyzer options.
 *
 * @param column        Column to run the assertion on
 * @param allowedValues Allowed values for the column
 * @param assertion     Function that receives a double input parameter and returns a boolean
 * @return the filterable check produced by the most general `isContainedIn` overload
 */
def isContainedIn(
    column: String,
    allowedValues: Array[String],
    assertion: Double => Boolean)
  : CheckWithLastConstraintFilterable = {

  val noHint = Option.empty[String]
  val noAnalyzerOptions = Option.empty[AnalyzerOptions]
  isContainedIn(column, allowedValues, assertion, noHint, noAnalyzerOptions)
}
// Scala lets only one overloaded alternative declare default arguments, so this overload
// spells out its (shorter) parameter list instead of relying on defaults.
/**
 * Asserts that every non-null value in a column is contained in a set of predefined values.
 * Uses no analyzer options.
 *
 * @param column        Column to run the assertion on
 * @param allowedValues Allowed values for the column
 * @param assertion     Function that receives a double input parameter and returns a boolean
 * @param hint          A hint to provide additional context why a constraint could have failed
 * @return the filterable check produced by the most general `isContainedIn` overload
 */
def isContainedIn(
    column: String,
    allowedValues: Array[String],
    assertion: Double => Boolean,
    hint: Option[String])
  : CheckWithLastConstraintFilterable = {

  val noAnalyzerOptions = Option.empty[AnalyzerOptions]
  isContainedIn(column, allowedValues, assertion, hint, noAnalyzerOptions)
}
// Scala lets only one overloaded alternative declare default arguments, so this overload
// takes every parameter explicitly instead of relying on defaults.
/**
 * Asserts that every non-null value in a column is contained in a set of predefined values.
 *
 * The allowed values are embedded as single-quoted SQL string literals into the predicate
 * `column IS NULL OR column IN (...)`, which is then handed to `satisfies`.
 *
 * @param column          Column to run the assertion on
 * @param allowedValues   Allowed values for the column
 * @param assertion       Function that receives a double input parameter and returns a boolean
 * @param hint            A hint to provide additional context why a constraint could have failed
 * @param analyzerOptions Options to configure analyzer behavior (NullTreatment, FilteredRow)
 * @return the filterable check produced by `satisfies`
 */
def isContainedIn(
    column: String,
    allowedValues: Array[String],
    assertion: Double => Boolean,
    hint: Option[String],
    analyzerOptions: Option[AnalyzerOptions])
  : CheckWithLastConstraintFilterable = {

  // Escape for a backslash-escaped, single-quoted SQL string literal (assumes Spark's default
  // string-literal parsing). Backslashes have to be escaped BEFORE quotes: otherwise a
  // backslash inside a value would be read as the start of an escape sequence, and a trailing
  // backslash would swallow the closing quote of the literal.
  val valueList = allowedValues
    .map { value => value.replace("\\", "\\\\").replace("'", "\\'") }
    .mkString("'", "','", "'")

  val predicate = s"`$column` IS NULL OR `$column` IN ($valueList)"

  // The constraint name deliberately shows the raw (unescaped) values.
  satisfies(predicate, s"$column contained in ${allowedValues.mkString(",")}",
    assertion, hint, List(column), analyzerOptions)
}
/**
 * Asserts that the non-null values in a numeric column fall into the predefined interval.
 * No assertion can be passed; the default assertion of `satisfies` (`Check.IsOne`) applies.
 *
 * @param column            Column to run the assertion on
 * @param lowerBound        Lower bound of the interval
 * @param upperBound        Upper bound of the interval
 * @param includeLowerBound Is a value equal to the lower bound allowed?
 * @param includeUpperBound Is a value equal to the upper bound allowed?
 * @param hint              A hint to provide additional context why a constraint could have
 *                          failed
 * @param analyzerOptions   Options to configure analyzer behavior (NullTreatment, FilteredRow)
 * @return the filterable check produced by `satisfies`
 */
def isContainedIn(
    column: String,
    lowerBound: Double,
    upperBound: Double,
    includeLowerBound: Boolean = true,
    includeUpperBound: Boolean = true,
    hint: Option[String] = None,
    analyzerOptions: Option[AnalyzerOptions] = None)
  : CheckWithLastConstraintFilterable = {

  // Inclusive bounds use the non-strict comparison operators, exclusive bounds the strict ones.
  val lowerComparison = if (includeLowerBound) ">=" else ">"
  val upperComparison = if (includeUpperBound) "<=" else "<"

  val withinBounds =
    s"(`$column` $lowerComparison $lowerBound AND `$column` $upperComparison $upperBound)"
  // NULL values are accepted, only non-null values have to lie within the interval.
  val predicate = s"`$column` IS NULL OR " + withinBounds
  val constraintName = s"$column between $lowerBound and $upperBound"

  satisfies(
    predicate,
    constraintName,
    hint = hint,
    columns = List(column),
    analyzerOptions = analyzerOptions)
}
/**
 * Evaluate this check on computed metrics.
 *
 * Every constraint is evaluated against the metric map of the context. If at least one
 * constraint failed, the status follows the level of this check (`Error` or `Warning`);
 * otherwise the status is `Success`.
 *
 * @param context result of the metrics computation
 * @return the result holding this check, its overall status and the constraint results
 */
def evaluate(context: AnalyzerContext): CheckResult = {
  val results = constraints.map(constraint => constraint.evaluate(context.metricMap))
  val hasFailure = results.exists(result => result.status == ConstraintStatus.Failure)

  val status =
    if (!hasFailure) {
      CheckStatus.Success
    } else {
      level match {
        case CheckLevel.Error => CheckStatus.Error
        case CheckLevel.Warning => CheckStatus.Warning
        case _ => CheckStatus.Success
      }
    }

  CheckResult(this, status, results)
}
/**
 * Collects the analyzers behind the constraints of this check.
 *
 * Decorated constraints are replaced by their `inner` constraint first; afterwards only
 * analysis-based constraints contribute their analyzer to the result.
 *
 * @return the set of analyzers of all analysis-based constraints
 */
def requiredAnalyzers(): Set[Analyzer[_, Metric[_]]] = {
  val undecorated = constraints.map {
    case decorated: ConstraintDecorator => decorated.inner
    case plain: Constraint => plain
  }

  undecorated
    .collect { case analysisBased: AnalysisBasedConstraint[_, _, _] => analysisBased.analyzer }
    .map { analyzer => analyzer.asInstanceOf[Analyzer[_, Metric[_]]] }
    .toSet
}
}
object Check {

  /** A common assertion function checking if the value is 1 */
  val IsOne: Double => Boolean = { _ == 1.0 }

  /**
   * Creates a check that consists of the single given constraint.
   *
   * @param constraint  The only constraint of the new check
   * @param description Description of the new check
   * @param checkLevel  Level of the new check, defaults to `CheckLevel.Error`
   * @return the new check
   */
  def fromConstraint(constraint: Constraint,
      description: String,
      checkLevel: CheckLevel.Value = CheckLevel.Error): Check = {
    Check(checkLevel, description, constraints = Seq(constraint))
  }

  /**
   * Common assertion function checking if the value can be considered as normal (that no
   * anomalies were detected), given the anomaly detection strategy and details on how to retrieve
   * the history
   *
   * @param metricsRepository        A metrics repository to get the previous results
   * @param anomalyDetectionStrategy The anomaly detection strategy
   * @param analyzer                 The analyzer for the metric to run anomaly detection on
   * @param withTagValues            Can contain a Map with tag names and the corresponding values
   *                                 to filter for
   * @param afterDate                The minimum dateTime of previous AnalysisResults to use for
   *                                 the Anomaly Detection
   * @param beforeDate               The maximum dateTime of previous AnalysisResults to use for
   *                                 the Anomaly Detection
   * @param currentMetricValue       current metric value
   * @return true if the anomaly detector reports no anomaly for the current metric value
   * @throws IllegalArgumentException if the repository holds no matching previous results, or
   *                                  if the newest previous result is dated so late that the
   *                                  new data point cannot be placed after it
   */
  private[deequ] def isNewestPointNonAnomalous[S <: State[S]](
      metricsRepository: MetricsRepository,
      anomalyDetectionStrategy: AnomalyDetectionStrategy,
      analyzer: Analyzer[S, Metric[Double]],
      withTagValues: Map[String, String],
      afterDate: Option[Long],
      beforeDate: Option[Long])(
      currentMetricValue: Double)
    : Boolean = {

    // Get history keys: restrict by tags, then by the optional date bounds, then by analyzer
    val taggedLoader = metricsRepository.load().withTagValues(withTagValues)
    val upperBoundedLoader = beforeDate
      .map { date => taggedLoader.before(date) }
      .getOrElse(taggedLoader)
    val dateBoundedLoader = afterDate
      .map { date => upperBoundedLoader.after(date) }
      .getOrElse(upperBoundedLoader)

    val analysisResults = dateBoundedLoader.forAnalyzers(Seq(analyzer)).get()

    require(analysisResults.nonEmpty, "There have to be previous results in the MetricsRepository!")

    val historicalMetrics = analysisResults
      // If we have multiple DataPoints with the same dateTime, which should not happen in most
      // cases, we still want consistent behaviour, so we sort them by Tags first
      // (sorting is stable in Scala)
      .sortBy(_.resultKey.tags.values)
      .map { analysisResult =>
        // Only the first metric entry of the loaded context is considered
        val doubleMetricOption = analysisResult.analyzerContext.metricMap.headOption
          .map { case (_, metric) => metric.asInstanceOf[Metric[Double]] }

        (analysisResult.resultKey.dataSetDate, doubleMetricOption)
      }

    // Ensure this is the last dataPoint. The bound is checked BEFORE adding 1: a newest date of
    // Long.MaxValue would otherwise silently wrap around to Long.MinValue and pass the check.
    val newestDataSetDate = analysisResults.map(_.resultKey.dataSetDate).max
    require(newestDataSetDate < Long.MaxValue - 1,
      "Test DateTime cannot be Long.MaxValue, otherwise the " +
        "Anomaly Detection, which works with an open upper interval bound, won't test anything")
    val testDateTime = newestDataSetDate + 1

    // Run given anomaly detection strategy and return false if the newest value is an Anomaly
    val anomalyDetector = AnomalyDetector(anomalyDetectionStrategy)
    val detectedAnomalies = anomalyDetector.isNewPointAnomalous(
      HistoryUtils.extractMetricValues[Double](historicalMetrics),
      DataPoint(testDateTime, Some(currentMetricValue)))

    detectedAnomalies.anomalies.isEmpty
  }
}
// © 2015 - 2025 Weber Informatics LLC | Privacy Policy