com.sparkutils.quality.impl.ReplaceWithIfMissingAttributesExpr.scala Maven / Gradle / Ivy
Go to download
Show more of this group Show more artifacts with this name
Show all versions of quality_3.2.1.oss_3.2_2.12 Show documentation
Show all versions of quality_3.2.1.oss_3.2_2.12 Show documentation
A Spark library for managing in-process data quality rules via Spark SQL
package com.sparkutils.quality.impl
import com.sparkutils.quality.impl.RuleLogicUtils.mapRules
import com.sparkutils.quality.impl.util.VariablesLookup.fieldsFromExpression
import com.sparkutils.quality.impl.imports.RuleResultsImports.DisabledRuleExpr
import com.sparkutils.quality.impl.imports.RuleRunnerImports
import com.sparkutils.quality.impl.util.LookupIdFunctions
import com.sparkutils.quality.{ExpressionRule, OutputExpression, Rule, RuleSuite}
import com.sparkutils.shim.expressions.Names.toName
import org.apache.spark.sql.catalyst.analysis.UnresolvedFunction
import org.apache.spark.sql.catalyst.expressions.{Expression, Literal, LambdaFunction => SparkLambdaFunction}
import org.apache.spark.sql.types.{NullType, StructType}
object ProcessDisableIfMissing extends RuleRunnerImports {

  /**
   * Processes a given RuleSuite, replacing the coalesceIfAttributesMissing /
   * coalesceIfAttributesMissingDisable placeholder calls in its lambda functions and rules.
   * This may be called before validate / docs but *must* be called *before* adding the
   * expression to a dataframe.
   *
   * NOTE(review): earlier documentation stated that an empty schema disables processing, but no
   * such short-circuit is visible in this method - the names derived from the schema are always
   * passed on. Confirm the intended behaviour for an empty schema against namesFromSchema.
   *
   * @param ruleSuite the suite whose lambda functions and rules are rewritten
   * @param schema The names to validate against, defaults to an empty StructType
   * @return the result of mapRules over a copy of ruleSuite carrying the rewritten lambda functions
   */
  def processIfAttributeMissing(ruleSuite: RuleSuite, schema: StructType = StructType(Seq())) = {
    val names = LookupIdFunctions.namesFromSchema(schema)
    // lambdas are rewritten first so the copied suite carries them into mapRules
    val lambdas = ruleSuite.lambdaFunctions.map(lf =>
      ProcessDisableIfMissing.processIfMissingLambdaCoalesce(lf.parsed, names))
    val wrs = ruleSuite.copy(lambdaFunctions = lambdas)
    mapRules(wrs) {
      ProcessDisableIfMissing.processCoalesceIfAttributeMissing(_, names)
    }
  }

  /**
   * For a given set of names rewrites the coalesceIfAttributesMissing placeholders of a lambda
   * whose body is an unresolved function call, swapping out missing attributes.
   *
   * Must be called before any planning - otherwise Spark will already have rejected it.
   *
   * @param rule the parsed lambda function to process
   * @param names the names an expression may refer to
   * @return a copy of rule with the rewritten body, or rule itself when it's expression is not a
   *         Spark lambda function or the lambda body is not an UnresolvedFunction
   */
  def processIfMissingLambdaCoalesce(rule: LambdaFunctionParsed, names: Set[String]): LambdaFunction =
    rule.expr match {
      case lambdaFunc: SparkLambdaFunction =>
        lambdaFunc.function match {
          case func: UnresolvedFunction =>
            val rexp = processCoalesceIfAttributeMissing(func, names)
            rule.copy(expr = lambdaFunc.copy(function = rexp))
          // NOTE(review): any other body is left untouched, so placeholders nested below a
          // non-function top node are not replaced here - confirm this is intended
          case _ => rule
        }
      // anything that is not a Spark lambda is passed through unchanged, mirroring the inner
      // fallback, rather than failing with a scala.MatchError
      case _ => rule
    }

  /**
   * Is this unresolved function one of the two placeholder names handled by this object
   */
  protected def isReplaceCoalesceName(unresolvedFunction: UnresolvedFunction): Boolean = {
    val name = toName(unresolvedFunction)
    name == "coalesceIfAttributesMissing" || name == "coalesceIfAttributesMissingDisable"
  }

  /**
   * Is this unresolved function the disabling variant of the placeholder
   */
  protected def isCoalesceDisabled(unresolvedFunction: UnresolvedFunction): Boolean = {
    val name = toName(unresolvedFunction)
    name == "coalesceIfAttributesMissingDisable"
  }

  /**
   * Converts expression removing any coalesceIfAttributesMissing placeholding expressions using names to find each
   * first expression with no missing names (or replacing with null / the disabled rule expression should all of them be).
   * @param expression the root of the expression tree to rewrite
   * @param names the names an expression may refer to
   * @return the replacement when expression is itself a placeholder, otherwise expression with
   *         each of it's children rewritten recursively
   */
  def processCoalesceIfAttributeMissing(expression: Expression, names: Set[String]): Expression =
    expression match {
      case funcExpr: UnresolvedFunction if isReplaceCoalesceName(funcExpr) =>
        processCoalesceIfAttributeMissing(funcExpr.children, names, isCoalesceDisabled(funcExpr))
      case other =>
        other.withNewChildren(other.children.map(processCoalesceIfAttributeMissing(_, names)))
    }

  /**
   * Returns the first expression that has no missing names, or a fallback when none qualify
   * @param expressions the placeholder's arguments, in priority order
   * @param names the names an expression may refer to
   * @param isDisabled when true the fallback is DisabledRuleExpr, otherwise a null literal
   * @return the first rewritten argument using only known names, or the fallback
   */
  protected def processCoalesceIfAttributeMissing(expressions: Seq[Expression], names: Set[String], isDisabled: Boolean): Expression =
    expressions.iterator
      // make sure no nested children are coalesce's
      .map(e => processCoalesceIfAttributeMissing(e, names))
      // the iterator stops at the first candidate whose used attributes are all present in names
      .find(r => fieldsFromExpression(r).attributesUsed.forall(names))
      .getOrElse {
        if (isDisabled)
          DisabledRuleExpr // special case as this is probably before the function registry gets called
        else
          Literal(null, NullType)
      }

  /**
   * Converts both rule expressions and any output expressions removing any coalesceIfAttributesMissing placeholding expressions using names to find each
   * first expression with no missing names (or replacing with null / the disabled rule expression should all of them be).
   * @param rule the rule to rewrite
   * @param names the names an expression may refer to
   * @return a rule carrying the rewritten expressions, or rule itself when it's expression is
   *         not an ExpressionRule
   */
  protected[quality] def processCoalesceIfAttributeMissing(rule: Rule, names: Set[String]): Rule =
    rule match {
      // rule and output
      case Rule(id, ExpressionRule(ruleText: String),
          onPass @ RunOnPassProcessorImpl(_, _, _, OutputExpression(outputText: String))) if outputText.nonEmpty =>
        val ruleExpr = processCoalesceIfAttributeMissing(RuleLogicUtils.expr(ruleText), names)
        val outputExpr = processCoalesceIfAttributeMissing(RuleLogicUtils.expr(outputText), names)
        Rule(id, ExpressionRuleExpr(ruleText, ruleExpr),
          onPass.copy(returnIfPassed = OutputExpressionExpr(outputText, outputExpr)))
      // just a rule
      case orule @ Rule(_, ExpressionRule(ruleText: String), _) =>
        orule.copy(expression = ExpressionRuleExpr(ruleText,
          processCoalesceIfAttributeMissing(RuleLogicUtils.expr(ruleText), names)))
      case _ => rule
    }
}
© 2015 - 2025 Weber Informatics LLC | Privacy Policy