com.sparkutils.quality.impl.RuleRunner.scala

A Spark library for managing in-process data quality rules via Spark SQL
package com.sparkutils.quality.impl
import com.sparkutils.quality.impl.RuleLogicUtils.mapRules
import com.sparkutils.quality.impl.RuleRunnerUtils.flattenExpressions
import com.sparkutils.quality.impl.imports.RuleResultsImports.packId
import com.sparkutils.quality._
import types.ruleSuiteResultType
import com.sparkutils.quality.impl.imports.RuleRunnerImports
import com.sparkutils.quality.impl.util.{NonPassThrough, PassThrough}
import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.catalyst.expressions.codegen.Block._
import org.apache.spark.sql.catalyst.expressions.codegen.{CodegenContext, CodegenFallback, ExprCode, ExprValue}
import org.apache.spark.sql.catalyst.expressions.{Expression, NonSQLExpression, UnaryExpression}
import org.apache.spark.sql.catalyst.util.ArrayBasedMapData
import org.apache.spark.sql.types.DataType
import org.apache.spark.sql.{Column, DataFrame, QualitySparkUtils}
import scala.reflect.ClassTag
object PackId {
  def packId(anyId: Any) = {
    val r = anyId.asInstanceOf[Id]
    ((r.id.toLong) << 32) | (r.version & 0xffffffffL)
  }

  def unpack(a: Any): Id =
    if (a == null)
      a.asInstanceOf[Id]
    else
      unpack(a.asInstanceOf[Long])

  def unpack(a: Long) = {
    val id = a >> 32
    val version = a.toInt
    Id(id.toInt, version) // lookup goes here
  }
}
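
// Hedged usage sketch (not part of the original source): demonstrates the 64-bit packing
// round trip above - the id occupies the high 32 bits, the version the low 32 bits.
// Assumes Id(id: Int, version: Int) from com.sparkutils.quality, as used by packId/unpack.
object PackIdRoundTripSketch {
  def main(args: Array[String]): Unit = {
    val original = Id(7, 3)
    val packed = PackId.packId(original)  // (7L << 32) | 3L
    val restored = PackId.unpack(packed)  // recovers the id and version from the Long
    assert(restored.id == original.id && restored.version == original.version)
  }
}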
protected[quality] object RuleRunnerImpl {
  /**
   * Creates a column that runs the RuleSuite. This also forces registering the lambda functions used by that RuleSuite.
   *
   * @param ruleSuite The Quality RuleSuite to evaluate
   * @param compileEvals Should the rules be compiled out to interim objects - by default true for eval usage, wholeStageCodeGen will evaluate in place
   * @param resolveWith This experimental parameter can take the DataFrame these rules will be added to, pre-resolving and optimising the SQL expressions; see the documentation for details on when to and when not to use this. RuleRunner does not currently perform wholeStageCodeGen when resolveWith is used.
   * @param variablesPerFunc Defaulting to 40, it allows, in combination with variableFuncGroup, customisation of how the 64k JVM method size limitation is handled when performing WholeStageCodeGen. You _shouldn't_ need it, but it's there just in case.
   * @param variableFuncGroup Defaulting to 20, how many generated functions are grouped into each outer function
   * @param forceRunnerEval Defaulting to false, passing true forces a simplified, partially interpreted evaluation (compileEvals must be false to get fully interpreted)
   * @return A Column representing the Quality DQ expression built from this ruleSuite
   */
  def ruleRunnerImpl(ruleSuite: RuleSuite, compileEvals: Boolean = true, resolveWith: Option[DataFrame] = None, variablesPerFunc: Int = 40, variableFuncGroup: Int = 20, forceRunnerEval: Boolean = false): Column = {
    com.sparkutils.quality.registerLambdaFunctions( ruleSuite.lambdaFunctions )

    val flattened = flattenExpressions(ruleSuite)
    val runner = new RuleRunner(RuleLogicUtils.cleanExprs(ruleSuite), PassThrough(flattened), compileEvals, variablesPerFunc, variableFuncGroup, forceRunnerEval)

    new Column(
      QualitySparkUtils.resolveWithOverride(resolveWith).map { df =>
        val resolved = QualitySparkUtils.resolveExpression(df, runner)

        resolved.asInstanceOf[RuleRunner].copy(child = resolved.children(0) match {
          // replace the expr
          case PassThrough(children) => NonPassThrough(children)
        })
      } getOrElse runner
    )
  }
}
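
// Hedged usage sketch (not part of the original source): shows how the Column produced by
// ruleRunnerImpl might be attached to a DataFrame. The SparkSession setup, the column names
// and the assumption that a populated RuleSuite is supplied are illustrative only.
object RuleRunnerImplUsageSketch {
  import org.apache.spark.sql.SparkSession

  def run(ruleSuite: RuleSuite): Unit = {
    val spark = SparkSession.builder().master("local[*]").getOrCreate()
    import spark.implicits._

    val df = Seq((1, "a"), (2, "b")).toDF("id", "name")
    // The added column has ruleSuiteResultType, holding the overall and per-rule results.
    val withDQ = df.withColumn("DataQuality", RuleRunnerImpl.ruleRunnerImpl(ruleSuite))
    withDQ.show(false)
  }
}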
private[quality] object RuleRunnerUtils extends RuleRunnerImports {
  def ruleResultToInt(ruleResult: RuleResult): Int =
    ruleResult match {
      case Failed => FailedInt
      case SoftFailed => SoftFailedInt
      case DisabledRule => DisabledRuleInt
      case Passed => PassedInt
      case Probability(percentage) => (percentage * PassedInt).toInt
      case RuleResultWithProcessor(res, _) => ruleResultToInt(res)
    }
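
  // Hedged illustration (not part of the original source): how ruleResultToInt maps results,
  // in particular that a Probability is scaled against PassedInt. Probability wrapping a
  // Double in [0, 1] is an assumption based on the match above.
  private[impl] object RuleResultToIntSketch {
    def demo(): Seq[Int] = Seq(
      ruleResultToInt(Passed),           // PassedInt
      ruleResultToInt(SoftFailed),       // SoftFailedInt
      ruleResultToInt(Probability(0.75)) // (0.75 * PassedInt).toInt, three quarters of a pass
    )
  }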
  def flattenExpressions(ruleSuite: RuleSuite): Seq[Expression] =
    ruleSuite.ruleSets.flatMap(ruleSet => ruleSet.rules.map(rule =>
      rule.expression match {
        case r: ExprLogic => r.expr // only ExprLogic are possible here
      }))

  def reincorporateExpressions(ruleSuite: RuleSuite, expr: Seq[Expression], compileEvals: Boolean = true): RuleSuite =
    reincorporateExpressionsF(ruleSuite, expr, (expr: Expression) => ExpressionWrapper(expr, compileEvals))

  def reincorporateExpressionsF[T](ruleSuite: RuleSuite, expr: Seq[T], f: T => RuleLogic): RuleSuite = {
    val itr = expr.iterator
    mapRules(ruleSuite) { rule =>
      rule.copy(expression = f(itr.next()))
    }
  }
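
  // Hedged sketch (not part of the original source): flattenExpressions and reincorporateExpressions
  // are intended to round trip - expressions are pulled out in RuleSet/Rule order, possibly rewritten
  // by Catalyst, then pushed back into a structurally identical RuleSuite as ExpressionWrapper logic.
  private[impl] object FlattenReincorporateSketch {
    def roundTrip(ruleSuite: RuleSuite): RuleSuite = {
      val flattened = flattenExpressions(ruleSuite) // one Expression per rule, in order
      reincorporateExpressions(ruleSuite, flattened)
    }
  }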
  case class RuleSuiteResultArray(packedId: Long, ruleSetIds: Array[Long], ruleSets: Array[Array[Long]])

  // create the base arrays backing each map
  def ruleSuiteArrays(ruleSuite: RuleSuite): RuleSuiteResultArray = {
    val (ruleSetIds, rulesArrays) =
      ruleSuite.ruleSets.map {
        ruleSet =>
          packTheId(ruleSet.id) ->
            ruleSet.rules.map(r => packTheId(r.id)).toArray
      }.unzip // make sure no funkiness on ordering occurs

    RuleSuiteResultArray(packTheId(ruleSuite.id),
      ruleSetIds.toArray, rulesArrays.toArray)
  }
  def evalArray(ruleSuite: RuleSuite, ruleSuiteArrays: RuleSuiteResultArray, results: Array[Any]): InternalRow = {
    import ruleSuite._

    val ruleSetRes = Array.ofDim[InternalRow](ruleSuiteArrays.ruleSetIds.length)
    var rsOverall = PassedInt
    var offset = 0

    for (rsi <- 0 until ruleSuiteArrays.ruleSetIds.length) {
      val rulesetSize = ruleSuiteArrays.ruleSets(rsi).length
      val ruleSetResults = results.slice(offset, offset + rulesetSize)
      offset += rulesetSize

      val overall = ruleSetResults.foldLeft(PassedInt) {
        (ov, res) =>
          OverallResultHelper.inplaceInt(res.asInstanceOf[Int], ov, probablePass) // convert needed for process
      }
      rsOverall = OverallResultHelper.inplaceInt(overall, rsOverall, probablePass)

      ruleSetRes(rsi) = InternalRow(
        overall: java.lang.Integer,
        ArrayBasedMapData( ruleSuiteArrays.ruleSets(rsi), ruleSetResults )
      )
    }

    InternalRow( ruleSuiteArrays.packedId,
      rsOverall: java.lang.Integer,
      ArrayBasedMapData( ruleSuiteArrays.ruleSetIds, ruleSetRes)
    )
  }
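
  // Hedged sketch (not part of the original source): evalArray expects one Int result per rule,
  // flattened in RuleSet order so it lines up with the packed ids produced by ruleSuiteArrays.
  // The all-pass results array here is illustrative only.
  private[impl] object EvalArraySketch {
    def allPassing(ruleSuite: RuleSuite): InternalRow = {
      val arrays = ruleSuiteArrays(ruleSuite)
      val ruleCount = arrays.ruleSets.map(_.length).sum
      val results = Array.fill[Any](ruleCount)(PassedInt) // every rule reports a full pass
      evalArray(ruleSuite, arrays, results)
    }
  }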
  def ruleResultToRow(ruleSuiteResult: RuleSuiteResult): InternalRow =
    InternalRow(
      packId(ruleSuiteResult.id),
      ruleResultToInt(ruleSuiteResult.overallResult),
      ArrayBasedMapData(
        ruleSuiteResult.ruleSetResults, packId, (a: Any) => {
          val v = a.asInstanceOf[RuleSetResult]
          InternalRow(
            ruleResultToInt(v.overallResult),
            ArrayBasedMapData(
              v.ruleResults, packId, (a: Any) => ruleResultToInt(a.asInstanceOf[RuleResult])
            )
          )
        }
      )
    )

  def packTheId(obj: Object) = packId(obj) //: java.lang.Long
  protected[quality] def generateFunctionGroups(ctx: CodegenContext, allExpr: Iterator[Seq[String]]#GroupedIterator[Seq[String]], paramsDef: String, paramsCall: String) = {
    val funNames =
      for (exprGroup <- allExpr) yield {
        val groupName = ctx.freshName("ruleRunnerEGroup")
        ctx.addNewFunction(groupName, {
          val funNames =
            for {
              exprFunc <- exprGroup
            } yield {
              val exprFuncName = ctx.freshName("ruleRunnerEFuncGroup")
              ctx.addNewFunction(exprFuncName,
                s"""
                private void $exprFuncName($paramsDef) {
                  ${exprFunc.mkString("\n")}
                }
                """
              )
            }

          s"""
          private void $groupName($paramsDef) {
            ${funNames.map { f => s"$f($paramsCall);" }.mkString("\n")}
          }
          """
        })
      }
    funNames
  }
  def genRuleSuiteTerm[T: ClassTag](ctx: CodegenContext): (String, (String, String) => String) = {
    val ruleSuiteClassName = classOf[RuleSuite].getName
    val ruleRunnerClassName = implicitly[ClassTag[T]].runtimeClass.getName
    val ruleRunnerExpressionIdx = ctx.references.size - 1

    val ruleSuitTerm = ctx.addMutableState(ruleSuiteClassName, ctx.freshName("ruleSuite"),
      v => s"$v = ($ruleSuiteClassName)((($ruleRunnerClassName)references" +
        s"[$ruleRunnerExpressionIdx]).ruleSuite());")

    val realChildrenTerm =
      (funname: String, className: String) =>
        ctx.addMutableState(className, ctx.freshName(funname),
          v => s"$v = ($className)((($ruleRunnerClassName)references" +
            s"[$ruleRunnerExpressionIdx]).$funname());")

    (ruleSuitTerm, realChildrenTerm)
  }
  def nonOutputRuleGen(ctx: CodegenContext, ev: ExprCode, i: String, ruleSuitTerm: String, utilsName: String,
                       realChildren: Seq[Expression], variablesPerFunc: Int, variableFuncGroup: Int,
                       resultF: (ExprValue, Int) => String
                      ): ExprCode = {
    val ruleSuiteArrays = ctx.addMutableState(classOf[RuleSuiteResultArray].getName,
      ctx.freshName("ruleSuiteArrays"),
      v => s"$v = com.sparkutils.quality.impl.RuleRunnerUtils.ruleSuiteArrays($ruleSuitTerm);"
    )

    val ruleRes = "java.lang.Object"
    val arrTerm = ctx.addMutableState(ruleRes + "[]", ctx.freshName("results"),
      v => s"$v = new $ruleRes[${realChildren.size}];")

    val allExpr = realChildren.zipWithIndex.map { case (child, idx) =>
      val eval = child.genCode(ctx)
      val converted =
        s"""${eval.code}\n
           $arrTerm[$idx] = ${eval.isNull} ? null : ${resultF(eval.value, idx)};"""
      converted
    }.grouped(variablesPerFunc).grouped(variableFuncGroup)

    val (paramsDef, paramsCall) =
      if (i ne null)
        (s"InternalRow $i", s"$i")
      else
        (ctx.currentVars.map(v => s"${if (v.value.javaType.isPrimitive) v.value.javaType else v.value.javaType.getName} ${v.value}, ${v.isNull.javaType} ${v.isNull}").mkString(", "), ctx.currentVars.map(v => s"${v.value}, ${v.isNull}").mkString(", "))

    val funNames: Iterator[String] =
      RuleRunnerUtils.generateFunctionGroups(ctx, allExpr, paramsDef, paramsCall)

    val res = ev.copy(code =
      code"""
        ${funNames.map { f => s"$f($paramsCall);" }.mkString("\n")}
        InternalRow ${ev.value} = $utilsName.evalArray($ruleSuitTerm, $ruleSuiteArrays, $arrTerm);
        boolean ${ev.isNull} = false;
        """
    )
    res
  }
}
/**
 * Children will be rewritten by the plan, or by resolveWith; they are then re-incorporated into the ruleSuite.
 *
 * The variablesPerFunc (40) and variableFuncGroup (20) defaults are chosen because of the 64k JVM method size
 * limitation; the null handling generated for all of a 1600 column row blasts the 64k limit. You may customise
 * these values if a given RuleSuite requires it.
 *
 * @param variablesPerFunc How many variables are placed in a single generated function
 * @param variableFuncGroup How many of those functions are then grouped into an outer function
 */
case class RuleRunner(ruleSuite: RuleSuite, child: Expression, compileEvals: Boolean,
                      variablesPerFunc: Int, variableFuncGroup: Int, forceRunnerEval: Boolean) extends UnaryExpression with NonSQLExpression with CodegenFallback {
  import RuleRunnerUtils._

  lazy val realChildren =
    child match {
      case r @ NonPassThrough(_) => r.rules
      case PassThrough(children) => children
    }

  override def nullable: Boolean = false

  override def toString: String = s"RuleRunner(${realChildren.mkString(", ")})"

  // used only for eval, compiled uses the children directly
  lazy val reincorporated = reincorporateExpressions(ruleSuite, realChildren, compileEvals)

  // keep it simple for this one - can return an internal row or whatever..
  override def eval(input: InternalRow): Any = {
    val res = RuleSuiteFunctions.eval(reincorporated, input)
    ruleResultToRow(res)
  }

  def dataType: DataType = ruleSuiteResultType

  /**
   * Instead of evaluating through the RuleSuite structure, the expressions are evaluated back to back; the
   * results alone are then reincorporated afterwards to work out the overall results.
   * @param ctx
   * @param ev
   * @return
   */
  override protected def doGenCode(ctx: CodegenContext, ev: ExprCode): ExprCode = {
    if (forceRunnerEval) {
      return super[CodegenFallback].doGenCode(ctx, ev)
    }

    val i = ctx.INPUT_ROW
    if (child.isInstanceOf[NonPassThrough] && (i eq null)) {
      // for some reason code gen ends up assuming iterator based gen on children instead of simple gen -
      // the actual gen isn't even called for flattenResultsTest, only for withResolve, could be dragons.
      return super[CodegenFallback].doGenCode(ctx, ev)
    }

    ctx.references += this
    // bind the rules
    val ruleSuitTerm = genRuleSuiteTerm[RuleRunner](ctx)._1

    val utilsName = "com.sparkutils.quality.impl.RuleRunnerUtils"

    nonOutputRuleGen(ctx, ev, i, ruleSuitTerm, utilsName, realChildren, variablesPerFunc, variableFuncGroup,
      (code: ExprValue, idx: Int) => s"com.sparkutils.quality.impl.RuleLogicUtils.anyToRuleResultInt($code)"
    )
  }

  protected def withNewChildInternal(newChild: Expression): Expression = copy(child = newChild)
}
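
// Hedged sketch (not part of the original source): customising the codegen grouping parameters
// when a very wide RuleSuite (e.g. rules over a ~1600 column row) would otherwise blast the
// 64k JVM method size limit during WholeStageCodeGen. The values below are illustrative only.
object WideRuleSuiteSketch {
  def wideRunner(ruleSuite: RuleSuite): Column =
    RuleRunnerImpl.ruleRunnerImpl(
      ruleSuite,
      variablesPerFunc = 20,  // fewer generated variables per inner function
      variableFuncGroup = 10  // fewer inner functions per grouping function
    )
}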