
// com.databricks.labs.automl.pyspark.FamilyRunnerUtil.scala Maven / Gradle / Ivy
// Go to download
// Show more of this group Show more artifacts with this name
// Show all versions of automatedml_2.11 Show documentation
// Databricks Labs AutoML toolkit
// The newest version!
package com.databricks.labs.automl.pyspark
import com.fasterxml.jackson.databind.ObjectMapper
import com.fasterxml.jackson.module.scala.DefaultScalaModule
import org.apache.spark.sql.DataFrame
import com.databricks.labs.automl.executor.config.{
ConfigurationGenerator,
InstanceConfig
}
import com.databricks.labs.automl.executor.FamilyRunner
import com.databricks.labs.automl.pyspark.utils.Utils
import com.databricks.labs.automl.utils.SparkSessionWrapper
import com.databricks.labs.automl.pipeline.inference.PipelineModelInference
import org.apache.spark.ml.PipelineModel
/**
  * Entry points that drive the AutoML family runner and pipeline inference from JSON-encoded
  * configuration strings.
  *
  * The `run*` methods all return `Unit`; each one publishes its result as a Spark temp view under
  * a fixed name: `modelReportDataFrame`, `generationReportDataFrame`, `bestMlFlowRunId`,
  * `inferenceDF`, `pathInferenceDF` and `featEngDf`.
  */
object FamilyRunnerUtil extends SparkSessionWrapper {

  /**
    * Shared Jackson mapper used by [[jsonToMap]].
    *
    * The Scala module is registered exactly once, during the (synchronized) lazy initialisation,
    * so the mapper's configuration is never modified after it has started reading values.
    */
  lazy val objectMapper: ObjectMapper =
    new ObjectMapper().registerModule(DefaultScalaModule)

  /**
    * Runs the family runner with pipeline execution and registers its reports as temp views.
    *
    * Views created: `modelReportDataFrame`, `generationReportDataFrame`, and `bestMlFlowRunId`
    * (columns `model_family`, `run_id`).
    *
    * @param df             input data handed to `FamilyRunner`
    * @param configs        JSON object; each top-level key maps to the settings of one instance
    *                       configuration (see [[buildArray]])
    * @param predictionType prediction type forwarded to `ConfigurationGenerator`
    */
  def runFamilyRunner(df: DataFrame,
                      configs: String,
                      predictionType: String): Unit = {
    // Required for the `toDF` call on the sequence of run-id pairs below.
    import spark.implicits._

    val instanceConfigs = buildArray(jsonToMap(configs), predictionType)
    val runner = FamilyRunner(df, instanceConfigs).executeWithPipeline()

    runner.familyFinalOutput.modelReportDataFrame
      .createOrReplaceTempView("modelReportDataFrame")
    runner.familyFinalOutput.generationReportDataFrame
      .createOrReplaceTempView("generationReportDataFrame")
    runner.bestMlFlowRunId.toSeq
      .toDF("model_family", "run_id")
      .createOrReplaceTempView("bestMlFlowRunId")
  }

  /**
    * Converts the nested values of a parsed JSON configuration into typed Scala values.
    *
    * Only the keys handled here appear in the result:
    *  - `fieldsToIgnoreInVector`, `outlierFieldsToIgnore`: `List[String]` becomes `Array[String]`
    *  - `numericBoundaries`: `Map[String, List[Any]]` becomes `Map[String, (Double, Double)]`,
    *    built from the first two values of each list
    *
    * Note that [[buildArray]] calls `Utils.cleansNestedTypes`, not this method.
    *
    * @param valuesMap parsed configuration values
    * @return a map containing only the converted entries; keys absent from `valuesMap` are omitted
    * @throws IllegalArgumentException if a `numericBoundaries` entry has fewer than two values
    */
  def cleansNestedTypes(valuesMap: Map[String, Any]): Map[String, Any] = {
    val stringArrays = Seq("fieldsToIgnoreInVector", "outlierFieldsToIgnore")
      .filter(key => valuesMap.contains(key))
      .map(key => key -> valuesMap(key).asInstanceOf[List[String]].toArray[String])

    val boundaries = valuesMap.get("numericBoundaries").map { raw =>
      "numericBoundaries" -> raw
        .asInstanceOf[Map[String, List[Any]]]
        .map {
          case (field, lower :: upper :: _) =>
            field -> ((lower.toString.toDouble, upper.toString.toDouble))
          case (field, other) =>
            // Fail with the offending field name instead of an opaque head/index error.
            throw new IllegalArgumentException(
              s"numericBoundaries entry '$field' needs at least two values (lower, upper), got: $other"
            )
        }
    }

    Map.empty[String, Any] ++ stringArrays ++ boundaries
  }

  /**
    * Loads a pipeline model by MLflow run id and registers its predictions as view `inferenceDF`.
    *
    * The label column is dropped from `df` before the model is applied.
    *
    * @param mlFlowRunId    MLflow run id used to look up the pipeline model
    * @param modelFamily    model family forwarded to `ConfigurationGenerator`
    * @param predictionType prediction type forwarded to `ConfigurationGenerator`
    * @param labelCol       name of the column removed from `df` before inference
    * @param configs        JSON object with the configuration values for this model family
    * @param df             data to run inference on
    */
  def runMlFlowInference(mlFlowRunId: String,
                         modelFamily: String,
                         predictionType: String,
                         labelCol: String,
                         configs: String,
                         df: DataFrame): Unit = {
    // TODO: add support for default configs
    val instanceConfig = ConfigurationGenerator.generateConfigFromMap(
      modelFamily,
      predictionType,
      jsonToMap(configs)
    )

    // The logging configuration is what the model lookup takes alongside the run id.
    val pipelineModel = PipelineModelInference.getPipelineModelByMlFlowRunId(
      mlFlowRunId,
      instanceConfig.loggingConfig
    )

    pipelineModel
      .transform(df.drop(labelCol))
      .createOrReplaceTempView("inferenceDF")
  }

  /**
    * Loads a saved `PipelineModel` from `path`, applies it to `dataframe` and registers the result
    * as view `pathInferenceDF`.
    *
    * @param path      location passed to `PipelineModel.load`
    * @param dataframe data to transform
    */
  def runPathInference(path: String, dataframe: DataFrame): Unit = {
    val loadedModel = PipelineModel.load(path)
    loadedModel
      .transform(dataframe)
      .createOrReplaceTempView(viewName = "pathInferenceDF")
  }

  /**
    * Generates the feature-engineering pipeline for one model family, applies it to `df` and
    * registers the result as view `featEngDf`.
    *
    * @param df             input data, used both to build the pipeline and as the transform input
    * @param modelFamily    applied to the result of `generateFeatureEngineeredPipeline`
    *                       (presumably a lookup keyed by model family; confirm against
    *                       `FamilyRunner`)
    * @param predictionType prediction type forwarded to `ConfigurationGenerator`
    * @param configs        JSON object; each top-level key maps to the settings of one instance
    *                       configuration (see [[buildArray]])
    */
  def runFeatureEngPipeline(df: DataFrame,
                            modelFamily: String,
                            predictionType: String,
                            configs: String): Unit = {
    val instanceConfigs = buildArray(jsonToMap(configs), predictionType)

    val featureEngPipelineModel = FamilyRunner(df, instanceConfigs)
      .generateFeatureEngineeredPipeline(verbose = true)(modelFamily)

    featureEngPipelineModel
      .transform(df)
      .createOrReplaceTempView(viewName = "featEngDf")
  }

  /**
    * Builds one `InstanceConfig` per top-level entry of `configs`.
    *
    * Each entry's value is expected to be a `Map[String, Any]`. It is merged with the output of
    * `Utils.cleansNestedTypes`, whose entries override the raw ones on key collisions, and the
    * entry's key is passed to `ConfigurationGenerator.generateConfigFromMap` as its first argument.
    *
    * @param configs        parsed JSON configuration, keyed as described above
    * @param predictionType prediction type forwarded to `ConfigurationGenerator`
    * @return the generated configurations, one per entry of `configs`
    */
  def buildArray(configs: Map[String, Any],
                 predictionType: String): Array[InstanceConfig] = {
    configs
      .asInstanceOf[Map[String, Map[String, Any]]]
      .map {
        case (familyKey, rawValuesMap) =>
          val valuesMap: Map[String, Any] =
            rawValuesMap ++ Utils.cleansNestedTypes(rawValuesMap)
          ConfigurationGenerator
            .generateConfigFromMap(familyKey, predictionType, valuesMap)
      }
      .toArray
  }

  /**
    * Parses a JSON document into a Scala map using the shared [[objectMapper]].
    *
    * @param message JSON text to parse
    * @return the parsed content as `Map[String, Any]`
    */
  def jsonToMap(message: String): Map[String, Any] =
    objectMapper.readValue(message, classOf[Map[String, Any]])
}
// © 2015 - 2025 Weber Informatics LLC | Privacy Policy