com.sparkutils.quality.impl.Validation.scala Maven / Gradle / Ivy
Go to download
Show more of this group Show more artifacts with this name
Show all versions of quality_3.4.1.oss_3.4_2.12 Show documentation
Show all versions of quality_3.4.1.oss_3.4_2.12 Show documentation
A Spark library for managing in-process data quality rules via Spark SQL
package com.sparkutils.quality.impl
import com.sparkutils.quality.impl.util.VariablesLookup.Identifiers
import com.sparkutils.quality.impl.util.RuleSuiteDocs.{IdTrEither, LambdaId, OutputExpressionId, RuleId}
import com.sparkutils.quality.impl.util.{Docs, DocsParser, ExpressionLookup, RuleSuiteDocs, VariablesLookup, WithDocs}
import com.sparkutils.quality._
import com.sparkutils.shim.ShowParams
import com.sparkutils.shim.expressions.Names.toName
import org.apache.spark.sql.catalyst.analysis.UnresolvedRelation
import org.apache.spark.sql.catalyst.expressions.{Expression, SubqueryExpression, LambdaFunction => SLambdaFunction}
import org.apache.spark.sql.types.StructType
import org.apache.spark.sql._
import scala.collection.mutable
/** Marker mixed into the errors and warnings in this file that relate to a rule expression. */
sealed trait RuleRelevant
/** Marker mixed into the errors and warnings in this file that relate to a lambda function. */
sealed trait LambdaRelevant
/** Marker mixed into the errors in this file that relate to an output expression. */
sealed trait OutputExpressionRelevant
/** Something that is reported against an [[Id]]. */
sealed trait HasId {
  /** The id the implementation is reported against. */
  def id: Id
}

/** Something that can be rendered as a complete output message. */
sealed trait HasOutputText {
  /** The full message; the errors and warnings in this file include the id in it. */
  def outputText: String
}

/** Something that can be rendered as a message without any id information. */
sealed trait HasNonIdText {
  /** The message only, without the id. */
  def nonIdText: String
}
/**
 * Base for RuleWarnings
 */
sealed trait RuleWarning extends HasId with HasOutputText with HasNonIdText {
  /** The warning message, without any id information. */
  def warning: String

  override def nonIdText: String = warning

  /**
   * If the error is syntax based - defined by parsing, rather than any later stage
   * @return true for syntax based warnings, false (the default) otherwise
   */
  def syntax: Boolean = false

  /** The warning message followed by the id it occurred on. */
  def warningText: String = s"$warning, occurred when processing id $id"

  override def outputText: String = warningText
}
/** A warning that is syntax based; `syntax` is fixed to true. */
sealed trait SyntaxWarning extends RuleWarning {
  final override def syntax: Boolean = true
}

/** A syntax warning that also carries the name it refers to. */
sealed trait SyntaxNameWarning extends SyntaxWarning {
  /** The name the warning is about. */
  def name: String
}
/** Warns that the lambda with this id may cause a StackOverflowError (SOE). */
case class LambdaPossibleSOE(id: Id) extends RuleWarning with LambdaRelevant {
  val warning: String = "Possible SOE detected"
}

/** Warns that parameter documentation was found on an expression which is not a lambda. */
case class NonLambdaDocParameters(id: Id) extends SyntaxWarning {
  val warning: String = "Parameter documentation is present on a non lambda expression"
}

/** Warns that a documented parameter name is not one of the lambda's arguments. */
case class ExtraDocParameter(id: Id, name: String) extends SyntaxNameWarning with LambdaRelevant {
  val warning: String = s"Parameter $name is not found in the lambda expression"
}
/**
 * Base for RuleErrors
 */
sealed trait RuleError extends HasId with HasOutputText with HasNonIdText {
  /** The error message, without any id information. */
  def error: String

  override def nonIdText: String = error

  /**
   * If the error is syntax based - defined by parsing, rather than any later stage
   * @return true for syntax based errors, false (the default) otherwise
   */
  def syntax: Boolean = false

  /** The error message followed by the id it occurred on. */
  def errorText: String = s"$error occurred when processing id $id"

  override def outputText: String = errorText
}
/** An error that is syntax based; `syntax` is fixed to true. */
sealed trait SyntaxError extends RuleError {
  final override def syntax: Boolean = true
}

/** An error for a name that could not be found; the message is derived from `name`. */
sealed trait NameMissingError extends RuleError {
  /** The missing name. */
  def name: String

  final override def error = s"Name $name is missing"
}

/** An error for a view that could not be found; the message is derived from `name`. */
sealed trait ViewMissingError extends RuleError {
  /** The missing view name. */
  def name: String

  final override def error = s"View $name is missing"
}
// --- errors raised for lambda functions ---

/** A lambda failed while its expression was obtained; `error` carries the exception message. */
case class LambdaSyntaxError(id: Id, error: String) extends SyntaxError with LambdaRelevant

/** Lambda processing hit a StackOverflowError. */
case class LambdaStackOverflowError(id: Id) extends SyntaxError with LambdaRelevant {
  val error: String = "A lambda function seems to infinitely recurse"
}

/** A lambda uses a name that is not one of the known field names. */
case class LambdaNameError(name: String, id: Id) extends NameMissingError with LambdaRelevant

/**
 * Several lambdas share both a name and an argument count.
 *
 * @param name      the lambda function name
 * @param count     how many implementations share the arity
 * @param argLength the shared number of arguments
 * @param ids       the ids of the clashing implementations; must be non-empty as `id` is `ids.head`
 */
case class LambdaMultipleImplementationWithSameArityError(name: String, count: Int, argLength: Int, ids: Set[Id]) extends SyntaxError with LambdaRelevant {
  val error: String = s"Lambda function $name has $count implementations with $argLength arguments"
  // Set.head throws NoSuchElementException on an empty set and its pick is not order-stable.
  val id: Id = ids.head
}

/** A lambda subquery refers to a view the view lookup does not know. */
case class LambdaViewError(name: String, id: Id) extends ViewMissingError with LambdaRelevant

// --- errors raised for rules ---

/** A rule failed while its expression was processed; `error` carries the exception message. */
case class RuleSyntaxError(id: Id, error: String) extends SyntaxError with RuleRelevant

/** A rule uses a field that is not one of the known field names. */
case class RuleNameError(name: String, id: Id) extends NameMissingError with RuleRelevant

/** A rule subquery refers to a view the view lookup does not know. */
case class RuleViewError(name: String, id: Id) extends ViewMissingError with RuleRelevant

// --- errors raised for output expressions ---

/** An output expression failed while it was processed; `error` carries the exception message. */
case class OutputRuleSyntaxError(id: Id, error: String) extends SyntaxError with OutputExpressionRelevant

/** An output expression uses a field that is not one of the known field names. */
case class OutputRuleNameError(name: String, id: Id) extends NameMissingError with OutputExpressionRelevant

/** An output expression subquery refers to a view the view lookup does not know. */
case class OutputRuleViewError(name: String, id: Id) extends ViewMissingError with OutputExpressionRelevant

// --- unknown spark functions ---

/** A lambda uses a function name reported as unknown by the variables lookup. */
case class LambdaSparkFunctionNameError(name: String, id: Id) extends NameMissingError with LambdaRelevant

/** A rule uses a function name reported as unknown by the variables lookup. */
case class SparkFunctionNameError(name: String, id: Id) extends NameMissingError with RuleRelevant

/** An output expression uses a function name reported as unknown (the class name spelling is kept for compatibility). */
case class OuputSparkFunctionNameError(name: String, id: Id) extends NameMissingError with OutputExpressionRelevant

/** Running the runner function against the DataFrame failed; reported against a fixed sentinel id. */
case class DataFrameSyntaxError(error: String) extends SyntaxError {
  val id: Id = Validation.dataFrameSyntaxErrorId
}
/** Entry points for validating a RuleSuite against a schema or DataFrame. */
object Validation {
// Sentinel id reported on the LambdaStackOverflowError raised when lambda processing overflows the stack.
val unknownSOEId = Id(Int.MinValue,Int.MinValue)
// Sentinel id reported on every DataFrameSyntaxError.
val dataFrameSyntaxErrorId = Id(Int.MinValue+1,Int.MinValue+1)
// Default for the viewLookup parameter of validate.
// NOTE(review): the right-hand side of this definition is missing in this copy - restore it from the upstream source.
protected[quality] val defaultViewLookup: String => Boolean =
// Docs instance used whenever DocsParser.parse yields no documentation.
protected[sparkutils] val emptyDocs = Docs()
/**
 * For a given dataFrame provide a full set of any validation errors for a given ruleSuite.
 *
 * @param schemaOrFrame when it's a Left( StructType ) the struct will be used to test against and an empty DataFrame of this type is created to resolve on the spark level. Using Right(DataFrame) will cause that dataframe to be used, which is great for test cases with a runnerFunction
 * @param ruleSuite the rule suite to validate
 * @param runnerFunction - allows you to create a ruleRunner or ruleEngineRunner with different configurations
 * @param showParams - configure how the output text is formatted using the same options and formatting as (TODO: the end of this sentence is cut off in this copy - confirm what it refers to)
 * @param qualityName - the column name to store the runnerFunction results in
 * @param recursiveLambdasSOEIsOk - this signals that finding a recursive lambda SOE should not stop the evaluations - if true it will still try to run any runnerFunction but may not give the correct results
 * @param transformBeforeShow - an optional transformation function to help shape what results are pushed to show
 * @param viewLookup - for any subquery used looks up the view name for being present (quoted and with schema), defaults to the current spark catalogue
 * @return the set of errors, the set of warnings, the output from the dataframe when a runnerFunction is specified (an empty string otherwise), the RuleSuiteDocs and the expression lookups by id
 */
// Validates the rule suite's lambdas, then its rules, then optionally runs it against a DataFrame,
// and combines the findings into a single result tuple.
def validate(schemaOrFrame: Either[StructType, DataFrame], ruleSuite: RuleSuite, showParams: ShowParams = ShowParams(),
runnerFunction: Option[DataFrame => Column] = None, qualityName: String = "Quality",
recursiveLambdasSOEIsOk: Boolean = false, transformBeforeShow: DataFrame => DataFrame = identity, viewLookup: String => Boolean = Validation.defaultViewLookup):
(Set[RuleError], Set[RuleWarning], String, RuleSuiteDocs, Map[IdTrEither, ExpressionLookup]) = {
// Use the given StructType directly, or take the schema of the given DataFrame.
val schema = schemaOrFrame.fold(identity, _.schema)
val names = namesFromSchema(schema)
// Collects documentation warnings from both the lambda and the rule passes.
val docsWarnings = mutable.Set[RuleWarning]()
val ((lambdaSyntaxErrors, lambdaLookups, potentialOverflows, unknownLambdaSparkFunctionErrors, lambdaArityErrors, lambdaNameErrors, lambdas, lambdaDocWarnings, lambadaExpressionLookups, lambdaViewErrors)) =
validateLambdas(ruleSuite, recursiveLambdasSOEIsOk, names, viewLookup) match {
// A Left is already a complete result, so validation stops here and returns it as is.
case Left(toReturn) => return toReturn
case Right(result) => result
// NOTE(review): the closing brace of the match above is missing in this copy.
docsWarnings ++= lambdaDocWarnings
val (ruleErrors, ruleDocWarnings, rules, outputExpressions, ruleExpressionLookups) = validateRules(ruleSuite, lambdaLookups, names, viewLookup)
docsWarnings ++= ruleDocWarnings
val (showOut, dfErrors) =
validateAgainstDataFrame(schemaOrFrame, showParams, runnerFunction, qualityName, transformBeforeShow, schema, viewLookup)
// NOTE(review): the next line has lost tokens in this copy (see the `++ ++` and the dangling `LambdaPossibleSOE )`);
// presumably lambdaSyntaxErrors and potentialOverflows were used here - restore from the upstream source.
(unknownLambdaSparkFunctionErrors ++ lambdaArityErrors ++ dfErrors ++ ruleErrors ++ lambdaNameErrors ++ ++ lambdaViewErrors, LambdaPossibleSOE ) ++ (Set() ++ docsWarnings)
, showOut, RuleSuiteDocs(rules, outputExpressions, lambdas), lambadaExpressionLookups ++ ruleExpressionLookups)
// Applies the optional runnerFunction to a DataFrame and renders the result as text.
// Returns the rendered text and any DataFrameSyntaxError; both are empty when no runnerFunction is given.
// NOTE(review): viewLookup is not used in the part of the body visible in this copy.
protected def validateAgainstDataFrame(schemaOrFrame: Either[StructType, DataFrame], showParams: ShowParams, runnerFunction: Option[DataFrame => Column], qualityName: String, transformBeforeShow: DataFrame => DataFrame, schema: StructType, viewLookup: String => Boolean) = {
// Use the supplied DataFrame, or build an empty one with the given schema.
val basedf = schemaOrFrame.right.getOrElse {
// NOTE(review): the right-hand side of `session` is missing in this copy - restore from the upstream source.
val session =
val empty = session.sparkContext.emptyRDD[Row]
session.createDataFrame(empty, schema)
val (showOut, dfErrors) =
runnerFunction.fold(("", Set.empty[RuleError]))(rf => {
// The runner column is created outside of the try, so a failure in rf itself is not turned into a DataFrameSyntaxError.
val runner = rf(basedf)
try {
val withRules = basedf.withColumn(qualityName, runner)
val transformed = transformBeforeShow(withRules)
(shim.utils.toString(transformed, showParams), Set.empty)
} catch {
// Any failure is reported as a DataFrameSyntaxError carrying the exception message, with no rendered output.
case e: Throwable => ("", Set(DataFrameSyntaxError(e.getMessage)))
(showOut, dfErrors)
// Validates every rule, and its output expression when one is set, in every rule set of the suite.
// Returns the errors, the documentation warnings, the documented rules, the documented output
// expressions and the expression lookups collected along the way.
protected def validateRules(ruleSuite: RuleSuite, lambdaLookups: Map[String, Map[Id, Set[String]]], names: Set[String], viewLookup: String => Boolean)= {
// validateRule with its first parameter list applied; the remaining parameters are supplied per rule.
val doRule = validateRule(lambdaLookups, names) _
// Accumulators filled as a side effect while the rule sets are traversed below.
var rules = Map.empty[Id, WithDocs[Rule]]
var outputExpressions = Map.empty[Id, WithDocs[RunOnPassProcessor]]
var exprLookups = Map.empty[IdTrEither, ExpressionLookup]
val docsWarnings = mutable.Set[RuleWarning]()
// Pairs the id with the rule and the docs parsed from its text, falling back to emptyDocs when parsing yields nothing.
// Parameter documentation only makes sense on lambdas, so it is flagged with NonLambdaDocParameters here.
def addDocs[T](id: Id, rule: T, expressionRule: HasRuleText): (Id, WithDocs[T]) =
DocsParser.parse(expressionRule.rule).map { parseddocs =>
val res = id -> WithDocs(rule, parseddocs)
if (parseddocs.params.nonEmpty) {
docsWarnings += NonLambdaDocParameters(id)
// NOTE(review): the closing brace of the if and the `res` result line appear to be missing in this copy.
}.getOrElse(id -> WithDocs(rule, emptyDocs))
// do the rules
val ruleErrors =
ruleSuite.ruleSets.flatMap { rs =>
rs.rules.flatMap { r =>
// NOTE(review): the id arguments in the calls below (`addDocs(,`, `doRule(,`, `RuleId(`, `OutputExpressionId(`)
// have been lost in this copy - restore from the upstream source.
rules += addDocs(, r, r.expression.asInstanceOf[HasRuleText])
val (ruleErrors, exprLookup) = doRule(, r.expression.asInstanceOf[HasExpr].expr, false, viewLookup)
exprLookups += RuleId( -> exprLookup
val outputErrors =
// Output expressions are only validated when the rule has a processor other than the no-op one.
if (r.runOnPassProcessor != NoOpRunOnPassProcessor.noOp) {
outputExpressions += addDocs(, r.runOnPassProcessor, r.runOnPassProcessor.returnIfPassed.asInstanceOf[OutputExpression])
val (oErrors, oExprLookup) = doRule(, r.runOnPassProcessor.returnIfPassed.expr, true, viewLookup)
exprLookups += OutputExpressionId( -> oExprLookup
// NOTE(review): the results of both branches (presumably oErrors and an empty set) are missing in this copy.
} else
ruleErrors ++ outputErrors
// Copies of the accumulated state are returned as immutable collections.
(ruleErrors, Set() ++ docsWarnings, Map() ++ rules, outputExpressions, Map() ++ exprLookups)
// Validates the lambda functions of the rule suite.
// Returns Left with a complete validate result when a StackOverflowError stops processing,
// otherwise Right with the individual findings (syntax errors, lookups, potential overflows,
// unknown function errors, arity errors, name errors, documented lambdas, doc warnings,
// expression lookups and view errors).
// NOTE(review): many expressions in this copy are truncated (missing receivers, bodies, `else`
// lines and closing braces) - restore from the upstream source before relying on the details.
protected def validateLambdas(ruleSuite: RuleSuite, recursiveLambdasSOEIsOk: Boolean, names: Set[String], viewLookup: String => Boolean): Either[(Set[RuleError], Set[RuleWarning], String, RuleSuiteDocs, Map[IdTrEither, ExpressionLookup]),
(Seq[(String, Either[(Id, Expression), LambdaSyntaxError])], Map[String, Map[Id, Set[String]]],
Set[Id], Set[LambdaSparkFunctionNameError], Set[LambdaMultipleImplementationWithSameArityError], Set[LambdaNameError], Map[Id, WithDocs[LambdaFunction]], Set[RuleWarning], Map[IdTrEither, ExpressionLookup], Set[LambdaViewError])] = {
// Accumulators filled as a side effect while the lambdas are processed below.
var lambdas = Map.empty[Id, WithDocs[LambdaFunction]]
val docsWarnings = mutable.Set[RuleWarning]()
// Views referenced by lambda subqueries that the viewLookup does not know.
val viewErrors = ruleSuite.lambdaFunctions.flatMap { f =>
try {
// NOTE(review): the rest of this call (presumably the lambda's id) is missing in this copy.
subQueryErrors(viewLookup, f.expr, LambdaViewError(_,
} catch {
// Might be a parser error, skip to let the below code pick it up
case _: Throwable => Set.empty[LambdaViewError]
// Obtains each lambda's expression and documentation; a failure becomes a LambdaSyntaxError.
// NOTE(review): the collection being traversed is missing before `{ f =>` in this copy.
val (lambdaLeftExpressions, lambdaSyntaxErrors) = { f =>
try {
val expr = f.expr
val ret = Left((, expr))
// Argument names of the lambda, or an empty set when the expression is not a lambda function.
val args =
expr match {
// NOTE(review): the result of this case is missing in this copy.
case lambda: SLambdaFunction =>
case _ => Set.empty[String]
DocsParser.parse(f.rule).map { parseddocs =>
lambdas += -> WithDocs(f, parseddocs)
// A documented parameter that is not an argument of the lambda is flagged with ExtraDocParameter.
parseddocs.params.keySet.foreach { name =>
if (!args.contains(name)) {
docsWarnings += ExtraDocParameter(, name)
}.getOrElse {
// No documentation could be parsed: register the lambda with emptyDocs.
lambdas += -> WithDocs(f, emptyDocs)
} catch {
case e: Throwable => Right(LambdaSyntaxError(, e.getMessage))
// NOTE(review): the partition predicate is missing in this copy.
}.partition {
// NOTE(review): the body of the mapValues function is missing in this copy.
val lambdaNameToExpressions = lambdaLeftExpressions.groupBy(p => p._1).mapValues(e =>
// NOTE(review): the body of this try is missing in this copy.
val (lambdaLookups, potentialOverflows, unknownLambdaSparkFunctions) = try {
} catch {
// SOE is possible with lambdas calling lambdas, capture that as a distinct issue
case soe: StackOverflowError =>
// When overflows are acceptable empty results are used; the `return Left` below presumably sat in an
// `else` branch whose keyword is missing in this copy.
if (recursiveLambdasSOEIsOk)
// type needed otherwise it gets stuck with the first param type derivation _1 <: String instead of String
(Map.empty[String, Map[Id, Identifiers]], Set.empty[Id], Map.empty[Id, Set[String]])
return Left((Set(LambdaStackOverflowError(Validation.unknownSOEId)), Set.empty[RuleWarning], "", RuleSuiteDocs(), Map.empty[IdTrEither, ExpressionLookup]))
// now that they are looked up, a bit duplicative but...
val exprLookups = lambdaNameToExpressions.values.flatMap( m => => LambdaId(pair._1) -> VariablesLookup.fieldsFromExpression(pair._2, lambdaLookups))).toMap
// Every unknown function name becomes a LambdaSparkFunctionNameError against the id it was found for.
val unknownLambdaSparkFunctionErrors = unknownLambdaSparkFunctions.flatMap(p => =>
LambdaSparkFunctionNameError(name, p._1))).toSet
// Only names with more than one implementation can clash on arity.
val lambdaArityErrors = lambdaNameToExpressions.filter(p => p._2.size > 1).flatMap {
pairs =>
val map = pairs._2
val counts = map.groupBy(_._2.children.size - 1) // one child is the return
// collectFirst yields at most one clashing arity group per lambda name.
val moreThan1 = counts.collectFirst { case f if f._2.size > 1 => f } { f =>
LambdaMultipleImplementationWithSameArityError(pairs._1, f._2.size, f._1, f._2.keySet)
// do we have variables used in the lambdas which are not in the schema?
val lambdaNameErrors: Set[LambdaNameError] =
lambdaLookups.flatMap { p =>
p._2.flatMap { pair =>
val (id, identifiers) = pair
// NOTE(review): the result for the empty case and the `else` appear to be missing in this copy.
if (identifiers.diff(names).isEmpty)
Some(identifiers.diff(names).map(LambdaNameError(_, id)))
Right((lambdaSyntaxErrors, lambdaLookups, potentialOverflows, unknownLambdaSparkFunctionErrors, lambdaArityErrors, lambdaNameErrors, Map() ++ lambdas, Set() ++ docsWarnings, exprLookups, viewErrors))
// Looks through the subquery expressions inside `expression` for unresolved relations whose
// table name is rejected by `lookup`, producing one T per finding via `f`.
// NOTE(review): the result of the inner case and the rest of the body (presumably applying `f`,
// flattening and converting to a Set) are missing in this copy.
protected def subQueryErrors[T](lookup: String => Boolean, expression: Expression, f: String => T): Set[T] = (expression collect {
case s: SubqueryExpression => s.plan.collect{
case rel: UnresolvedRelation if !lookup(rel.tableName) =>
/**
 * Validates a single rule or output expression.
 *
 * @param lambdaLookups passed to VariablesLookup.fieldsFromExpression together with the expression
 * @param names         the field names the expression is allowed to refer to
 * @param id            the id reported on every error raised here
 * @param exprThunk     by-name expression; it is evaluated exactly once, inside the try, so that a
 *                      failure while obtaining it is reported as a syntax error
 * @param outputRule    when true the Output* error types are raised, otherwise the rule ones
 * @param viewLookup    returns true when the named view is present
 * @return the errors found together with the ExpressionLookup of the expression; when processing
 *         fails a single syntax error and an empty ExpressionLookup are returned
 */
protected def validateRule(lambdaLookups: Map[String, Map[Id, Set[String]]], names: Set[String])(id: Id, exprThunk: => Expression, outputRule: Boolean, viewLookup: String => Boolean): (Set[RuleError], ExpressionLookup) =
  try {
    val expr = exprThunk
    val exl @ ExpressionLookup(exprFields, unknownSparkFunctions, _, _) = VariablesLookup.fieldsFromExpression(expr, lambdaLookups)

    // fields used by the expression which are not known names
    val rules = exprFields.flatMap{
      field =>
        if (names.contains(field))
          None
        else
          Some(
            if (!outputRule)
              RuleNameError(field, id)
            else
              OutputRuleNameError(field, id)
          )
    }

    // reuse the already evaluated expression rather than forcing the by-name parameter a second time
    val viewErrors = subQueryErrors(viewLookup, expr, if (outputRule) OutputRuleViewError(_, id) else RuleViewError(_, id))

    val unknown = unknownSparkFunctions.map{ name =>
      if (!outputRule)
        SparkFunctionNameError(name, id)
      else
        OuputSparkFunctionNameError(name, id)
    }

    (rules ++ unknown ++ viewErrors, exl)
  } catch {
    // Deliberately broad: any failure while processing the expression is reported against the id
    // as a syntax error instead of aborting the whole validation.
    case e: Throwable => (Set(
      if (!outputRule)
        RuleSyntaxError(id, e.getMessage)
      else
        OutputRuleSyntaxError(id, e.getMessage)
    ), impl.util.ExpressionLookup())
  }
© 2015 - 2025 Weber Informatics LLC | Privacy Policy