
com.databricks.labs.automl.inference.InferencePipeline.scala Maven / Gradle / Ivy
Go to download
Show more of this group Show more artifacts with this name
Show all versions of automatedml_2.11 Show documentation
Show all versions of automatedml_2.11 Show documentation
Databricks Labs AutoML toolkit
The newest version!
package com.databricks.labs.automl.inference
import com.databricks.labs.automl.executor.AutomationConfig
import com.databricks.labs.automl.feature.structures.NominalIndexCollection
import com.databricks.labs.automl.pipeline.FeaturePipeline
import com.databricks.labs.automl.sanitize.Scaler
import com.databricks.labs.automl.utils.{AutomationTools, DataValidation}
import ml.dmlc.xgboost4j.scala.spark.{
XGBoostClassificationModel,
XGBoostRegressionModel
}
import org.apache.spark.ml.classification._
import org.apache.spark.ml.feature.{StringIndexer, VectorAssembler}
import org.apache.spark.ml.regression._
import org.apache.spark.sql.DataFrame
import org.apache.spark.sql.functions._
import com.databricks.labs.automl.inference.InferenceConfig._
import org.apache.spark.ml.Pipeline
class InferencePipeline(df: DataFrame)
extends AutomationConfig
with AutomationTools
with DataValidation
with InferenceTools {
/**
* Data Prep to:
* - select only the initial columns that were present at the beginning of the training run
* - Convert the datetime entities to correct actionable types
* - StringIndex categorical (text or ordinal) fields
* - Fill NA with the values that were used during the training run for each column
* @return The courier object InferencePayload(data, modelingColumns, allColumns)
*/
private def dataPreparation(): InferencePayload = {
  val dataConfig = _inferenceConfig.inferenceDataConfig

  // Filter out any non-used fields that may be included in future data sets that weren't part of model training
  // TODO - Have to remove this temporarily
  // val initialColumnRestriction = df.select(_inferenceConfig.inferenceDataConfig.startingColumns map col:_*)

  // Configure the feature pipeline for an inference run using the stored data settings.
  val pipelineBuilder = new FeaturePipeline(df, isInferenceRun = true)
    .setLabelCol(dataConfig.labelCol)
    .setFeatureCol(dataConfig.featuresCol)
    .setDateTimeConversionType(dataConfig.dateTimeConversionType)

  // Yields the processed DataFrame, the fields destined for the feature vector, and the full field listing.
  val (indexed, modelingFields, everyField) =
    pipelineBuilder.makeFeaturePipeline(dataConfig.fieldsToIgnore)

  // NA fill is optional; when enabled, categorical fills are applied before numeric fills.
  val prepared =
    if (!_inferenceConfig.inferenceSwitchSettings.naFillFlag) indexed
    else {
      val fillConfig = _inferenceConfig.featureEngineeringConfig.naFillConfig
      indexed.na
        .fill(fillConfig.categoricalColumns)
        .na
        .fill(fillConfig.numericColumns)
    }

  createInferencePayload(prepared, modelingFields, everyField)
}
/**
* Helper method for creating the Feature Vector for modeling / feature engineering tasks
* @param payload InferencePayload object that contains:
* - The DataFrame
* - The List of Columns to be included in the Feature Vector
* - The Full List of Columns (including ignored columns used for post-inference joining, etc.)
* @return a new InferencePayload object (with the DataFrame now including a feature vector)
*/
private def createFeatureVector(
  payload: InferencePayload
): InferencePayload = {
  val featuresColumn = _inferenceConfig.inferenceDataConfig.featuresCol

  // Assemble the modeling columns into a single vector column named by the config.
  val assembled = new VectorAssembler()
    .setInputCols(payload.modelingColumns)
    .setOutputCol(featuresColumn)
    .transform(payload.data)

  // The vector column is appended to the full column listing; the modeling columns are unchanged.
  createInferencePayload(
    assembled,
    payload.modelingColumns,
    payload.allColumns ++ Array(featuresColumn)
  )
}
/**
* Helper method for applying one hot encoding to the feature vector, if used in the original modeling run
* @param payload InferencePayload object
* @return a new InferencePayload object (the DataFrame, with an updated feature vector, and the field listings
* now having any previous StringIndexed fields converted to OneHotEncoded fields.)
*/
private def oneHotEncodingTransform(
  payload: InferencePayload
): InferencePayload = {
  val dataConfig = _inferenceConfig.inferenceDataConfig

  // A fresh FeaturePipeline over the payload's data, configured like the one used in data preparation.
  val encoder = new FeaturePipeline(payload.data, isInferenceRun = true)
    .setLabelCol(dataConfig.labelCol)
    .setFeatureCol(dataConfig.featuresCol)
    .setDateTimeConversionType(dataConfig.dateTimeConversionType)

  // applyOneHotEncoding hands back the new DataFrame plus both updated column listings.
  val (encodedData, encodedVectorFields, encodedAllFields) =
    encoder.applyOneHotEncoding(payload.modelingColumns, payload.allColumns)

  createInferencePayload(encodedData, encodedVectorFields, encodedAllFields)
}
/**
* Private helper function for recreating the feature interaction fields that were specified during model creation
* @param payload Previous step payload of data, columns in feature vector, and all columns
* @return a new InferencePayload object that has the added feature interaction fields.
* @since 0.6.2
* @author Ben Wilson, Databricks
*/
private def createFeatureInteractions(
  payload: InferencePayload
): InferencePayload = {
  val interactionSpecs =
    _inferenceConfig.featureEngineeringConfig.featureInteractionConfig.interactions

  // Add one product column (left * right) per configured interaction.
  val interactedData = interactionSpecs.foldLeft(payload.data) { (frame, spec) =>
    frame.withColumn(spec.outputName, col(spec.left) * col(spec.right))
  }

  // Only interactions whose two parents are both "nominal" are flagged for string indexing.
  val interactionFields = interactionSpecs.map { spec =>
    val bothNominal =
      spec.leftDataType == "nominal" && spec.rightDataType == "nominal"
    NominalIndexCollection(spec.outputName, indexCheck = bothNominal)
  }

  val indexerStages = interactionFields.filter(_.indexCheck).map { field =>
    new StringIndexer()
      .setHandleInvalid("keep")
      .setInputCol(field.name)
      .setOutputCol(field.name + "_si")
  }

  // NOTE(review): the indexers are fit on the data being scored in this call, not loaded from
  // the training run — confirm the resulting indices line up with what the model was trained on.
  val indexingModel =
    new Pipeline().setStages(indexerStages).fit(interactedData)

  // Indexed interactions are exposed under their "_si" name; the rest keep their output name.
  val outputFields = interactionFields.map { field =>
    if (field.indexCheck) field.name + "_si" else field.name
  }

  createInferencePayload(
    indexingModel.transform(interactedData),
    payload.modelingColumns ++ outputFields,
    payload.allColumns ++ outputFields
  )
}
/**
* Method for performing all configured FeatureEngineering tasks as set in the InferenceMainConfig
* @param payload InferencePayload object
* @return new InferencePayload object with all actions applied to the Dataframe and associated field listings
* that were originally performed in model training.
*/
private def executeFeatureEngineering(
  payload: InferencePayload
): InferencePayload = {
  // Each stage below is gated by its switch and consumes the payload produced by the previous stage.
  // Throws IllegalArgumentException if an outlier-filter entry carries a direction other than
  // "greater" or "lesser" (previously surfaced as an uninformative scala.MatchError).
  val switches = _inferenceConfig.inferenceSwitchSettings
  val engineering = _inferenceConfig.featureEngineeringConfig

  // Variance Filtering: drop the fields recorded as removed by the variance filter.
  val variancePayload =
    if (switches.varianceFilterFlag) {
      removeArrayOfColumns(
        payload,
        engineering.varianceFilterConfig.fieldsRemoved
      )
    } else payload

  // Outlier Filtering: apply one row filter per (field -> (value, direction)) entry.
  val outlierPayload =
    if (switches.outlierFilterFlag) {
      val filteredData = engineering.outlierFilteringConfig.fieldRemovalMap
        .foldLeft(variancePayload.data) { (frame, entry) =>
          val field = entry._1
          val threshold = entry._2._1
          entry._2._2 match {
            // "greater" keeps rows at or below the threshold; "lesser" keeps rows at or above it.
            case "greater" => frame.filter(col(field) <= threshold)
            case "lesser"  => frame.filter(col(field) >= threshold)
            case unknown =>
              throw new IllegalArgumentException(
                s"Unsupported outlier filter direction '$unknown' for field '$field'; " +
                  "expected \"greater\" or \"lesser\"."
              )
          }
        }
      createInferencePayload(
        filteredData,
        variancePayload.modelingColumns,
        variancePayload.allColumns
      )
    } else variancePayload

  // Covariance Filtering
  val covariancePayload =
    if (switches.covarianceFilterFlag) {
      removeArrayOfColumns(
        outlierPayload,
        engineering.covarianceFilteringConfig.fieldsRemoved
      )
    } else outlierPayload

  // Pearson Filtering
  val pearsonPayload =
    if (switches.pearsonFilterFlag) {
      removeArrayOfColumns(
        covariancePayload,
        engineering.pearsonFilteringConfig.fieldsRemoved
      )
    } else covariancePayload

  // Build the Interacted Features
  val featureInteractionPayload =
    if (switches.featureInteractionFlag) {
      createFeatureInteractions(pearsonPayload)
    } else pearsonPayload

  // Build the Feature Vector (always performed)
  val featureVectorPayload = createFeatureVector(featureInteractionPayload)

  // OneHotEncoding
  val oneHotEncodedPayload =
    if (switches.oneHotEncodeFlag) {
      oneHotEncodingTransform(featureVectorPayload)
    } else featureVectorPayload

  // Scaling: only the DataFrame changes; the column listings carry over unchanged.
  if (switches.scalingFlag) {
    val scalerConfig = engineering.scalingConfig
    val scaledData = new Scaler(oneHotEncodedPayload.data)
      .setFeaturesCol(_inferenceConfig.inferenceDataConfig.featuresCol)
      .setScalerType(scalerConfig.scalerType)
      .setScalerMin(scalerConfig.scalerMin)
      .setScalerMax(scalerConfig.scalerMax)
      .setStandardScalerMeanMode(scalerConfig.standardScalerMeanFlag)
      .setStandardScalerStdDevMode(scalerConfig.standardScalerStdDevFlag)
      .setPNorm(scalerConfig.pNorm)
      .scaleFeatures()
    createInferencePayload(
      scaledData,
      oneHotEncodedPayload.modelingColumns,
      oneHotEncodedPayload.allColumns
    )
  } else oneHotEncodedPayload
}
/**
* Helper method for loading and applying a transformation on the Dataframe from FeatureEngineering tasks.
* @param data The Dataframe from feature engineering output.
* @return A Dataframe with a prediction and/or probability column applied.
*/
private def loadModelAndInfer(data: DataFrame): DataFrame = {
  // Loads the persisted model selected by (modelFamily, modelType) from the configured path
  // and applies it to `data`.
  // Throws IllegalArgumentException when the combination is not one of the cases below
  // (previously an unsupported value surfaced as an uninformative scala.MatchError).
  val modelConfig = _inferenceConfig.inferenceModelConfig
  val modelFamily = modelConfig.modelFamily
  val modelType = modelConfig.modelType
  val modelLoadPath = modelConfig.modelPathLocation

  (modelFamily, modelType) match {
    // Families that persist distinct regressor / classifier model classes.
    case ("XGBoost", "regressor") =>
      XGBoostRegressionModel.load(modelLoadPath).transform(data)
    case ("XGBoost", "classifier") =>
      XGBoostClassificationModel.load(modelLoadPath).transform(data)
    case ("RandomForest", "regressor") =>
      RandomForestRegressionModel.load(modelLoadPath).transform(data)
    case ("RandomForest", "classifier") =>
      RandomForestClassificationModel.load(modelLoadPath).transform(data)
    case ("GBT", "regressor") =>
      GBTRegressionModel.load(modelLoadPath).transform(data)
    case ("GBT", "classifier") =>
      GBTClassificationModel.load(modelLoadPath).transform(data)
    case ("Trees", "regressor") =>
      DecisionTreeRegressionModel.load(modelLoadPath).transform(data)
    case ("Trees", "classifier") =>
      DecisionTreeClassificationModel.load(modelLoadPath).transform(data)
    // Families with a single model class: modelType is not consulted.
    case ("MLPC", _) =>
      MultilayerPerceptronClassificationModel.load(modelLoadPath).transform(data)
    case ("LinearRegression", _) =>
      LinearRegressionModel.load(modelLoadPath).transform(data)
    case ("LogisticRegression", _) =>
      LogisticRegressionModel.load(modelLoadPath).transform(data)
    case ("SVM", _) =>
      LinearSVCModel.load(modelLoadPath).transform(data)
    case _ =>
      throw new IllegalArgumentException(
        s"Unsupported model configuration: modelFamily='$modelFamily', modelType='$modelType'. " +
          "Supported families: XGBoost, RandomForest, GBT, Trees (each with modelType " +
          "'regressor' or 'classifier'), MLPC, LinearRegression, LogisticRegression, SVM."
      )
  }
}
/**
* Helper method for loading the InferenceMainConfig from a DataFrame that has been written to a storage location
* during model training. After loading the Dataframe, the value in row 1 column 1 will be extracted, converted
* to json, converted to an instance of InferenceMainConfig, and finally used to set the current state of this
* class' MainInferenceConfig.
* @param inferenceDataFrameSaveLocation The storage location path of the Dataframe.
*/
private def getAndSetConfigFromDataFrame(
  inferenceDataFrameSaveLocation: String
): Unit = {
  // Read the persisted config DataFrame, extract the config from it, and install it on this instance.
  val storedConfigFrame = spark.read.load(inferenceDataFrameSaveLocation)
  setInferenceConfig(extractInferenceConfigFromDataFrame(storedConfigFrame))
}
/**
* Main private method for executing an inference run.
* @return A Dataframe with an applied model prediction.
*/
private def inferencePipeline(): DataFrame = {
  // Data preparation feeds feature engineering; only the resulting DataFrame is passed to the model.
  val engineered = executeFeatureEngineering(dataPreparation())
  loadModelAndInfer(engineered.data)
}
/**
* Public method for performing an inference run from a stored InferenceConfig Dataframe location.
* @param inferenceConfigDFPath Path on storage of where the Dataframe was written during the training run.
* @return A Dataframe with predictions based on a pre-trained model.
*/
def runInferenceFromStoredDataFrame(
  inferenceConfigDFPath: String
): DataFrame = {
  // The stored configuration must be installed first: every pipeline stage reads _inferenceConfig.
  getAndSetConfigFromDataFrame(inferenceConfigDFPath)
  inferencePipeline()
}
/**
* Public method for performing an inference run from a supplied inference config string.
* @param jsonConfig the saved inference config from a previous run as string-encoded json
* @return A Dataframe with prediction based on a pre-trained model.
*/
def runInferenceFromJSONConfig(jsonConfig: String): DataFrame = {
  // Parse the JSON into a config instance and install it before running the pipeline.
  setInferenceConfig(convertJsonConfigToClass(jsonConfig))
  inferencePipeline()
}
/**
 * Accessor for the inference configuration currently held by this pipeline instance.
 * @return the current value of `_inferenceConfig`
 */
def getInferenceConfig: InferenceMainConfig = _inferenceConfig
}
© 2015 - 2025 Weber Informatics LLC | Privacy Policy