com.sparkutils.quality.impl.util.ConfigLoader.scala
A Spark library for managing in-process data quality rules via Spark SQL
package com.sparkutils.quality.impl.util
import com.sparkutils.quality.{DataFrameLoader, Id}
import org.apache.spark.sql.{AnalysisException, Column, DataFrame, Dataset, Encoder, functions}
/**
 * Represents a configuration row for view loading
 *
 * @param name the view name, this will be used to manage dependencies
 * @param source either an already loaded DataFrame or a SQL statement to run against the catalog
 */
class Config(val name: String, val source: Either[DataFrame, String])
/**
 * Underlying row information converted into a ViewConfig with the following logic:
 *
 * a) if token is specified, sql is ignored
 * b) if token is null, sql is used
 * c) if both are null, the row will not be used
 */
class Row(val name: String, val token: Option[String], val filter: Option[String], val sql: Option[String])
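// Illustrative examples of the resolution logic above (hypothetical values, not from the library):
//   new Row("viewA", Some("tokA"), None, Some("select 1"))    // token wins, sql is ignored
//   new Row("viewB", None, None, Some("select * from t"))     // no token, the sql text is used
//   new Row("viewC", None, None, None)                        // rejected, "viewC" is reported back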
/**
 * Creates a config object of the correct type
 *
 * @tparam T the concrete Config type produced
 * @tparam R the Row type consumed
 */
trait ConfigFactory[T <: Config, R <: Row] {
  def create(base: Config, row: R): T
}
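// A minimal sketch of a ConfigFactory implementation, for illustration only.
// ExampleConfig and ExampleConfigFactory are hypothetical names, not part of the library;
// a real factory would typically also read any extra columns carried by its Row subtype.
class ExampleConfig(name: String, source: Either[DataFrame, String]) extends Config(name, source)

object ExampleConfigFactory extends ConfigFactory[ExampleConfig, Row] {
  // rewrap the already-resolved base Config fields; no extra row data is used here
  def create(base: Config, row: Row): ExampleConfig = new ExampleConfig(base.name, base.source)
}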
protected[quality] object ConfigLoader {

  /**
   * Loads view configurations from a given DataFrame for the given ruleSuiteId. Wherever token is present the loader will be called and the filter optionally applied.
   *
   * @param loader used to resolve token references to DataFrames
   * @param configDF the configuration DataFrame to select rows from
   * @param ruleSuiteId only rows matching this Id's id and version are used
   * @return a tuple of the created configs and the names of rows with unexpected content (either token or sql must be present)
   */
  def loadConfigs[T <: Config, R <: Row : Encoder](loader: DataFrameLoader, configDF: DataFrame,
                                                   ruleSuiteIdColumn: Column,
                                                   ruleSuiteVersionColumn: Column,
                                                   ruleSuiteId: Id,
                                                   name: Column,
                                                   token: Column,
                                                   filter: Column,
                                                   sql: Column,
                                                   extra: Column*
                                                  )(implicit factory: ConfigFactory[T, R]): (Seq[T], Set[String]) = {
    val filtered =
      configDF.filter(
          ruleSuiteIdColumn === ruleSuiteId.id && ruleSuiteVersionColumn === ruleSuiteId.version)
        .select(Seq(name.as("name"), token.as("token"), filter.as("filter"), sql.as("sql")) ++ extra: _*)
        .as[R]
    loadConfigs[T, R](loader, filtered)
  }
  /**
   * Loads view configurations from a given DataFrame. Wherever token is present the loader will be called and the filter optionally applied.
   *
   * @return a tuple of the created configs and the names of rows with unexpected content (either token or sql must be present)
   */
  def loadConfigs[T <: Config, R <: Row : Encoder](loader: DataFrameLoader, configDF: DataFrame,
                                                   name: Column,
                                                   token: Column,
                                                   filter: Column,
                                                   sql: Column,
                                                   extra: Column*
                                                  )(implicit factory: ConfigFactory[T, R]): (Seq[T], Set[String]) = {
    val filtered =
      configDF
        .select(Seq(name.as("name"), token.as("token"), filter.as("filter"), sql.as("sql")) ++ extra: _*)
        .as[R]
    loadConfigs[T, R](loader, filtered)
  }
  /**
   * Performs the actual load against a pre-prepared dataset
   *
   * @return a tuple of the created configs and the names of rows with unexpected content (either token or sql must be present)
   */
  protected[quality] def loadConfigs[T <: Config, R <: Row](loader: DataFrameLoader, filtered: Dataset[R])(implicit factory: ConfigFactory[T, R]): (Seq[T], Set[String]) = {
    // rows with neither a token nor sql cannot be resolved; collect their names to report back
    val rejects = {
      import filtered.sparkSession.implicits._
      filtered.filter("token is null and sql is null").select("name").as[String].collect().toSet
    }
    val rows = filtered.filter("not(token is null and sql is null)").collect().toSeq
    (
      rows.map { vr =>
        factory.create(
          new Config(vr.name,
            // token takes precedence: load it and optionally apply the filter;
            // otherwise use the sql text (.get is safe, null/null rows were rejected above)
            vr.token.fold[Either[DataFrame, String]](Right(vr.sql.get)) { token =>
              val df = loader.load(token)
              Left(vr.filter.fold(df)(df.filter(_)))
            }
          ), vr)
      }, rejects)
  }
}
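// Usage sketch (hypothetical — ConfigLoader is protected[quality], so callers sit inside the
// quality package). Assumes an implicit Encoder[Row] and a DataFrameLoader named myLoader are
// in scope; the table and column names below are examples only:
//
//   implicit val factory: ConfigFactory[ExampleConfig, Row] = ExampleConfigFactory
//   val (configs, rejected) = ConfigLoader.loadConfigs[ExampleConfig, Row](
//     myLoader, spark.table("view_config"),
//     functions.col("ruleSuiteId"), functions.col("ruleSuiteVersion"), Id(1, 1),
//     functions.col("name"), functions.col("token"), functions.col("filter"), functions.col("sql")
//   )
//   // rejected holds the names of rows where both token and sql were null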