// com.sparkutils.quality.impl.Validation.scala
// Part of quality_2.4_2.11: a Spark library for managing in-process data quality rules via Spark SQL.
package com.sparkutils.quality.impl
import com.sparkutils.quality.impl.util.VariablesLookup.Identifiers
import com.sparkutils.quality.impl.util.RuleSuiteDocs.{IdTrEither, LambdaId, OutputExpressionId, RuleId}
import com.sparkutils.quality.impl.util.{Docs, DocsParser, ExpressionLookup, RuleSuiteDocs, VariablesLookup, WithDocs}
import com.sparkutils.quality._
import com.sparkutils.shim.ShowParams
import com.sparkutils.shim.expressions.Names.toName
import org.apache.spark.sql.catalyst.analysis.UnresolvedRelation
import org.apache.spark.sql.catalyst.expressions.{Expression, SubqueryExpression, LambdaFunction => SLambdaFunction}
import org.apache.spark.sql.types.StructType
import org.apache.spark.sql._
import scala.collection.mutable
sealed trait RuleRelevant
sealed trait LambdaRelevant
sealed trait OutputExpressionRelevant
sealed trait HasId {
def id: Id
}
sealed trait HasOutputText {
def outputText: String
}
sealed trait HasNonIdText {
def nonIdText: String
}
/**
* Base for RuleWarnings
*/
sealed trait RuleWarning extends HasId with HasOutputText with HasNonIdText {
def warning: String
override def nonIdText: String = warning
/**
* If the warning is syntax based - i.e. raised during parsing, rather than at any later stage
* @return
*/
def syntax: Boolean = false
def warningText = s"$warning occurred when processing id $id"
override def outputText: String = warningText
}
sealed trait SyntaxWarning extends RuleWarning {
final override def syntax: Boolean = true
}
sealed trait SyntaxNameWarning extends SyntaxWarning {
def name: String
}
case class LambdaPossibleSOE(id: Id) extends RuleWarning with LambdaRelevant {
val warning = "Possible SOE detected"
}
case class NonLambdaDocParameters(id: Id) extends SyntaxWarning {
val warning = "Parameter documentation is present on a non lambda expression"
}
case class ExtraDocParameter(id: Id, name: String) extends SyntaxNameWarning with LambdaRelevant {
val warning = s"Parameter $name is not found in the lambda expression"
}
/**
* Base for RuleErrors
*/
sealed trait RuleError extends HasId with HasOutputText with HasNonIdText {
def error: String
override def nonIdText: String = error
/**
* If the error is syntax based - defined by parsing, rather than any later stage
* @return
*/
def syntax: Boolean = false
def errorText = s"$error occurred when processing id $id"
override def outputText: String = errorText
}
sealed trait SyntaxError extends RuleError {
final override def syntax: Boolean = true
}
sealed trait NameMissingError extends RuleError {
def name: String
final override def error = s"Name $name is missing"
}
sealed trait ViewMissingError extends RuleError {
def name: String
final override def error = s"View $name is missing"
}
case class LambdaSyntaxError(id: Id, error: String) extends SyntaxError with LambdaRelevant
case class LambdaStackOverflowError(id: Id) extends SyntaxError with LambdaRelevant {
val error = "A lambda function seems to infinitely recurse"
}
case class LambdaNameError(name: String, id: Id) extends NameMissingError with LambdaRelevant
case class LambdaMultipleImplementationWithSameArityError(name: String, count: Int, argLength: Int, ids: Set[Id]) extends SyntaxError with LambdaRelevant {
val error = s"Lambda function $name has $count implementations with $argLength arguments"
val id = ids.head
}
case class LambdaViewError(name: String, id: Id) extends ViewMissingError with LambdaRelevant
case class RuleSyntaxError(id: Id, error: String) extends SyntaxError with RuleRelevant
case class RuleNameError(name: String, id: Id) extends NameMissingError with RuleRelevant
case class RuleViewError(name: String, id: Id) extends ViewMissingError with RuleRelevant
case class OutputRuleSyntaxError(id: Id, error: String) extends SyntaxError with OutputExpressionRelevant
case class OutputRuleNameError(name: String, id: Id) extends NameMissingError with OutputExpressionRelevant
case class OutputRuleViewError(name: String, id: Id) extends ViewMissingError with OutputExpressionRelevant
case class LambdaSparkFunctionNameError(name: String, id: Id) extends NameMissingError with LambdaRelevant
case class SparkFunctionNameError(name: String, id: Id) extends NameMissingError with RuleRelevant
case class OuputSparkFunctionNameError(name: String, id: Id) extends NameMissingError with OutputExpressionRelevant
case class DataFrameSyntaxError(error: String) extends SyntaxError {
val id = Validation.dataFrameSyntaxErrorId
}
object Validation {
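// Sentinel ids used when no concrete rule or lambda id is available: unknownSOEId when a
// StackOverflowError prevents identifying the offending lambda, dataFrameSyntaxErrorId for
// failures raised at the DataFrame level.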
val unknownSOEId = Id(Int.MinValue,Int.MinValue)
val dataFrameSyntaxErrorId = Id(Int.MinValue+1,Int.MinValue+1)
protected[quality] val defaultViewLookup: String => Boolean =
SparkSession.active.catalog.tableExists(_)
protected[sparkutils] val emptyDocs = Docs()
/**
* For a given dataFrame provide a full set of any validation errors for a given ruleSuite.
*
* @param schemaOrFrame when a Left(StructType) is provided the struct is validated against and an empty DataFrame of that type is created to resolve expressions at the Spark level; a Right(DataFrame) uses that DataFrame directly, which is useful for test cases with a runnerFunction
* @param ruleSuite the RuleSuite whose lambdas, rules and output expressions are validated
* @param runnerFunction - allows you to create a ruleRunner or ruleEngineRunner with different configurations
* @param showParams - configure how the output text is formatted using the same options and formatting as dataFrame.show
* @param qualityName - the column name to store the runnerFunction results in
* @param recursiveLambdasSOEIsOk - when true, detecting a recursive lambda SOE does not stop validation; any runnerFunction is still attempted but may not give correct results
* @param transformBeforeShow - an optional transformation function to help shape what results are pushed to show
* @param viewLookup - checks, for any subquery used, whether the view name (quoted and with schema) is present; defaults to the current Spark catalogue
* @return A set of errors and the output from the dataframe when a runnerFunction is specified
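*
* A minimal usage sketch (illustrative only: `mySuite` and `df` are assumed to be an existing RuleSuite and DataFrame):
* {{{
* val (errors, warnings, showOutput, docs, lookups) =
*   Validation.validate(Right(df), mySuite)
* errors.foreach(e => println(e.outputText))
* warnings.foreach(w => println(w.outputText))
* }}}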
*/
def validate(schemaOrFrame: Either[StructType, DataFrame], ruleSuite: RuleSuite, showParams: ShowParams = ShowParams(),
runnerFunction: Option[DataFrame => Column] = None, qualityName: String = "Quality",
recursiveLambdasSOEIsOk: Boolean = false, transformBeforeShow: DataFrame => DataFrame = identity, viewLookup: String => Boolean = Validation.defaultViewLookup):
(Set[RuleError], Set[RuleWarning], String, RuleSuiteDocs, Map[IdTrEither, ExpressionLookup]) = {
val schema = schemaOrFrame.fold(identity, _.schema)
val names = namesFromSchema(schema)
val docsWarnings = mutable.Set[RuleWarning]()
val (lambdaSyntaxErrors, lambdaLookups, potentialOverflows, unknownLambdaSparkFunctionErrors, lambdaArityErrors, lambdaNameErrors, lambdas, lambdaDocWarnings, lambdaExpressionLookups, lambdaViewErrors) =
validateLambdas(ruleSuite, recursiveLambdasSOEIsOk, names, viewLookup) match {
case Left(toReturn) => return toReturn
case Right(result) => result
}
docsWarnings ++= lambdaDocWarnings
val (ruleErrors, ruleDocWarnings, rules, outputExpressions, ruleExpressionLookups) = validateRules(ruleSuite, lambdaLookups, names, viewLookup)
docsWarnings ++= ruleDocWarnings
val (showOut, dfErrors) =
validateAgainstDataFrame(schemaOrFrame, showParams, runnerFunction, qualityName, transformBeforeShow, schema, viewLookup)
(unknownLambdaSparkFunctionErrors ++ lambdaArityErrors ++ dfErrors ++ ruleErrors ++ lambdaNameErrors ++ lambdaSyntaxErrors.map(_._2.right.get).toSet ++ lambdaViewErrors,
potentialOverflows.map( LambdaPossibleSOE ) ++ (Set() ++ docsWarnings)
, showOut, RuleSuiteDocs(rules, outputExpressions, lambdas), lambdaExpressionLookups ++ ruleExpressionLookups)
}
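// Creates an empty DataFrame from the schema when only a StructType was supplied, then, if a
// runnerFunction is given, adds its Column as qualityName, applies transformBeforeShow and captures
// the show output; any exception thrown while doing so is reported as a DataFrameSyntaxError.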
protected def validateAgainstDataFrame(schemaOrFrame: Either[StructType, DataFrame], showParams: ShowParams, runnerFunction: Option[DataFrame => Column], qualityName: String, transformBeforeShow: DataFrame => DataFrame, schema: StructType, viewLookup: String => Boolean) = {
val basedf = schemaOrFrame.right.getOrElse {
val session = SparkSession.active
val empty = session.sparkContext.emptyRDD[Row]
session.createDataFrame(empty, schema)
}
val (showOut, dfErrors) =
runnerFunction.fold(("", Set.empty[RuleError]))(rf => {
val runner = rf(basedf)
try {
val withRules = basedf.withColumn(qualityName, runner)
val transformed = transformBeforeShow(withRules)
(shim.utils.toString(transformed, showParams), Set.empty)
} catch {
case e: Throwable => ("", Set(DataFrameSyntaxError(e.getMessage)))
}
})
(showOut, dfErrors)
}
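// Walks every rule in the suite (and any output expression attached via runOnPassProcessor),
// parsing doc comments (parameter docs on a non-lambda expression produce NonLambdaDocParameters)
// and delegating per-expression checks to validateRule, with lookups keyed by RuleId / OutputExpressionId.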
protected def validateRules(ruleSuite: RuleSuite, lambdaLookups: Map[String, Map[Id, Set[String]]], names: Set[String], viewLookup: String => Boolean)= {
val doRule = validateRule(lambdaLookups, names) _
var rules = Map.empty[Id, WithDocs[Rule]]
var outputExpressions = Map.empty[Id, WithDocs[RunOnPassProcessor]]
var exprLookups = Map.empty[IdTrEither, ExpressionLookup]
val docsWarnings = mutable.Set[RuleWarning]()
def addDocs[T](id: Id, rule: T, expressionRule: HasRuleText): (Id, WithDocs[T]) =
DocsParser.parse(expressionRule.rule).map { parseddocs =>
val res = id -> WithDocs(rule, parseddocs)
if (parseddocs.params.nonEmpty) {
docsWarnings += NonLambdaDocParameters(id)
}
res
}.getOrElse(id -> WithDocs(rule, emptyDocs))
// do the rules
val ruleErrors =
ruleSuite.ruleSets.flatMap { rs =>
rs.rules.flatMap { r =>
rules += addDocs(r.id, r, r.expression.asInstanceOf[HasRuleText])
val (ruleErrors, exprLookup) = doRule(r.id, r.expression.asInstanceOf[HasExpr].expr, false, viewLookup)
exprLookups += RuleId(r.id) -> exprLookup
val outputErrors =
if (r.runOnPassProcessor != NoOpRunOnPassProcessor.noOp) {
outputExpressions += addDocs(r.runOnPassProcessor.id, r.runOnPassProcessor, r.runOnPassProcessor.returnIfPassed.asInstanceOf[OutputExpression])
val (oErrors, oExprLookup) = doRule(r.runOnPassProcessor.id, r.runOnPassProcessor.returnIfPassed.expr, true, viewLookup)
exprLookups += OutputExpressionId(r.runOnPassProcessor.id) -> oExprLookup
oErrors
} else
Set.empty
ruleErrors ++ outputErrors
}
}.toSet
(ruleErrors, Set() ++ docsWarnings, Map() ++ rules, outputExpressions, Map() ++ exprLookups)
}
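// Parses each lambda function and collects: syntax errors, views used in subqueries that cannot be
// found, possible infinite recursion between lambdas (stack overflow), multiple implementations with
// the same arity, unknown Spark functions and identifiers not present in the schema. Returns Left with
// a completed validation result when an SOE aborts processing and recursiveLambdasSOEIsOk is false.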
protected def validateLambdas(ruleSuite: RuleSuite, recursiveLambdasSOEIsOk: Boolean, names: Set[String], viewLookup: String => Boolean): Either[(Set[RuleError], Set[RuleWarning], String, RuleSuiteDocs, Map[IdTrEither, ExpressionLookup]),
(Seq[(String, Either[(Id, Expression), LambdaSyntaxError])], Map[String, Map[Id, Set[String]]],
Set[Id], Set[LambdaSparkFunctionNameError], Set[LambdaMultipleImplementationWithSameArityError], Set[LambdaNameError], Map[Id, WithDocs[LambdaFunction]], Set[RuleWarning], Map[IdTrEither, ExpressionLookup], Set[LambdaViewError])] = {
var lambdas = Map.empty[Id, WithDocs[LambdaFunction]]
val docsWarnings = mutable.Set[RuleWarning]()
val viewErrors = ruleSuite.lambdaFunctions.flatMap { f =>
try {
subQueryErrors(viewLookup, f.expr, LambdaViewError(_, f.id))
} catch {
// Might be a parser error, skip to let the below code pick it up
case _: Throwable => Set.empty[LambdaViewError]
}
}.toSet
val (lambdaLeftExpressions, lambdaSyntaxErrors) = ruleSuite.lambdaFunctions.map { f =>
(f.name,
try {
val expr = f.expr
val ret = Left((f.id, expr))
val args =
expr match {
case lambda: SLambdaFunction => lambda.arguments.map(toName).toSet
case _ => Set.empty[String]
}
DocsParser.parse(f.rule).map { parseddocs =>
lambdas += f.id -> WithDocs(f, parseddocs)
parseddocs.params.keySet.foreach { name =>
if (!args.contains(name)) {
docsWarnings += ExtraDocParameter(f.id, name)
}
}
}.getOrElse {
lambdas += f.id -> WithDocs(f, emptyDocs)
}
ret
} catch {
case e: Throwable => Right(LambdaSyntaxError(f.id, e.getMessage))
})
}.partition {
_._2.isLeft
}
val lambdaNameToExpressions = lambdaLeftExpressions.groupBy(p => p._1).mapValues(e => e.map(_._2.left.get).toMap).toMap
val (lambdaLookups, potentialOverflows, unknownLambdaSparkFunctions) = try {
VariablesLookup.processLambdas(lambdaNameToExpressions)
} catch {
// SOE is possible with lambdas calling lambdas, capture that as a distinct issue
case soe: StackOverflowError =>
if (recursiveLambdasSOEIsOk)
// type needed otherwise it gets stuck with the first param type derivation _1 <: String instead of String
(Map.empty[String, Map[Id, Identifiers]], Set.empty[Id], Map.empty[Id, Set[String]])
else
return Left((Set(LambdaStackOverflowError(Validation.unknownSOEId)), Set.empty[RuleWarning], "", RuleSuiteDocs(), Map.empty[IdTrEither, ExpressionLookup]))
}
// now that the lambdas are looked up, derive per-lambda expression lookups - a bit duplicative but needed for the returned map
val exprLookups = lambdaNameToExpressions.values.flatMap( m => m.map(pair => LambdaId(pair._1) -> VariablesLookup.fieldsFromExpression(pair._2, lambdaLookups))).toMap
val unknownLambdaSparkFunctionErrors = unknownLambdaSparkFunctions.flatMap(p => p._2.map(name =>
LambdaSparkFunctionNameError(name, p._1))).toSet
val lambdaArityErrors = lambdaNameToExpressions.filter(p => p._2.size > 1).flatMap {
pairs =>
val map = pairs._2
val counts = map.groupBy(_._2.children.size - 1) // one child is the return
val moreThan1 = counts.collectFirst { case f if f._2.size > 1 => f }
moreThan1.map { f =>
LambdaMultipleImplementationWithSameArityError(pairs._1, f._2.size, f._1, f._2.keySet)
}
}.toSet
// do we have variables used in the lambdas which are not in the schema?
val lambdaNameErrors: Set[LambdaNameError] =
lambdaLookups.flatMap { p =>
p._2.flatMap { pair =>
val (id, identifiers) = pair
if (identifiers.diff(names).isEmpty)
None
else
Some(identifiers.diff(names).map(LambdaNameError(_, id)))
}
}.flatten.toSet
Right((lambdaSyntaxErrors, lambdaLookups, potentialOverflows, unknownLambdaSparkFunctionErrors, lambdaArityErrors, lambdaNameErrors, Map() ++ lambdas, Set() ++ docsWarnings, exprLookups, viewErrors))
}
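// Collects, for every subquery within the expression, the UnresolvedRelation table names that fail
// the supplied lookup, mapping each missing name with f (typically into a ViewMissingError).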
protected def subQueryErrors[T](lookup: String => Boolean, expression: Expression, f: String => T): Set[T] = (expression collect {
case s: SubqueryExpression => s.plan.collect{
case rel: UnresolvedRelation if !lookup(rel.tableName) =>
f(rel.tableName)
}
}).flatten.toSet
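// Validates a single rule or output expression: resolves the fields and Spark functions it uses via
// VariablesLookup, reporting names missing from the schema, unknown functions and missing views; any
// parse failure is reported as a RuleSyntaxError or OutputRuleSyntaxError depending on outputRule.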
protected def validateRule(lambdaLookups: Map[String, Map[Id, Set[String]]], names: Set[String])(id: Id, exprThunk: => Expression, outputRule: Boolean, viewLookup: String => Boolean): (Set[RuleError], ExpressionLookup) =
try {
val expr = exprThunk
val exl @ ExpressionLookup(exprFields, unknownSparkFunctions, _, _) = VariablesLookup.fieldsFromExpression(expr, lambdaLookups)
val rules = exprFields.flatMap{
field =>
if (names.contains(field))
None
else
Some(
if (!outputRule)
RuleNameError(field, id)
else
OutputRuleNameError(field, id)
)
}.toSet[RuleError]
val viewErrors = subQueryErrors(viewLookup, exprThunk, if (outputRule) OutputRuleViewError(_, id) else RuleViewError(_, id))
val unknown = unknownSparkFunctions.map{ name =>
if (!outputRule)
SparkFunctionNameError(name, id)
else
OuputSparkFunctionNameError(name, id)
}
(rules ++ unknown ++ viewErrors, exl)
} catch {
case e: Throwable => (Set(
if (!outputRule)
RuleSyntaxError(id, e.getMessage)
else
OutputRuleSyntaxError(id, e.getMessage)
), impl.util.ExpressionLookup())
}
}