com.sparkutils.quality.impl.util.transformers.scala Maven / Gradle / Ivy
Go to download
Show more of this group Show more artifacts with this name
Show all versions of quality_2.4_2.11 Show documentation
Show all versions of quality_2.4_2.11 Show documentation
A Spark library for managing in-process data quality rules via Spark SQL
The newest version!
package com.sparkutils.quality.impl.util
import com.sparkutils.quality.{RuleSuite, ruleFolderRunner}
import org.apache.spark.sql.DataFrame
import org.apache.spark.sql.types.StructType
protected[quality] object AddDataFunctions {

  /**
   * Leverages the foldRunner to replace fields, the input fields are used to create a structure that the rules fold over.
   * The fields are then dropped from the original Dataframe and added back from the resulting structure.
   *
   * NOTE: The field order and types of the original DF will be maintained only when maintainOrder is true. As it requires access to the schema it may incur extra work.
   *
   * @param rules the RuleSuite handed to ruleFolderRunner
   * @param fields names of the columns combined into the struct that is folded over; these columns are dropped
   *               and re-added from the folded result. Must contain at least one name.
   * @param foldFieldName name of the column added to hold the ruleFolderRunner output, its `result` field is read
   *                      to lift the replaced fields back to the top level
   * @param debugMode when true the last results are taken for the replaced fields
   * @param tempFoldDebugName name of a temporary column, used only when debugMode is true, which holds the last
   *                          result before it is expanded; it is dropped again before returning
   * @param maintainOrder when true the schema is used to replace fields in the correct location, when false they are simply appended
   * @param useType In the case you must use select and can't use withColumn you may provide a type directly to stop the NPE
   * @return a function that applies the fold and field replacement to a DataFrame
   * @throws IllegalArgumentException when the returned function is applied and fields is empty
   */
  def ifoldAndReplaceFields(rules: RuleSuite, fields: Seq[String], foldFieldName: String = "foldedFields", debugMode: Boolean = false,
                            tempFoldDebugName: String = "tempFOLDDEBUG",
                            maintainOrder: Boolean = true, useType: Option[StructType] = None): DataFrame => DataFrame = df => {
    import org.apache.spark.sql.functions._

    // struct(head, tail: _*) below would otherwise fail with an opaque "head of empty list" error
    require(fields.nonEmpty, "ifoldAndReplaceFields requires at least one field name in fields")

    val theStruct = struct(fields.head, fields.tail: _*)

    val withFolder = {
      // select NPEs needs projection to work
      //df.select(expr("*"), ruleFolderRunner(rules, theStruct).as(foldFieldName))
      df.withColumn(foldFieldName, ruleFolderRunner(rules, theStruct, debugMode = debugMode, useType = useType))
    }

    // create now as the schema will have the folder, which we may want to keep
    val namesInOrder =
      if (maintainOrder)
        withFolder.schema.map(_.name)
      else
        Seq()

    // lift the results
    val result =
      if (debugMode)
        // the last entry of the result array is selected into a temporary column, expanded, then the temporary is removed
        withFolder.drop(fields: _*).selectExpr("*",
          s"element_at($foldFieldName.result, -1).result as $tempFoldDebugName"
        ).selectExpr("*", s"$tempFoldDebugName.*").drop(tempFoldDebugName)
      else
        withFolder.drop(fields: _*).selectExpr("*", s"$foldFieldName.result.*")

    // bring back to top level in the correct order
    if (maintainOrder)
      // namesInOrder always holds at least foldFieldName here, so head is safe
      result.select(namesInOrder.head, namesInOrder.tail: _*)
    else
      result
  }
}
© 2015 - 2025 Weber Informatics LLC | Privacy Policy