
// Source listing: com.databricks.labs.automl.sanitize.DataSanitizer.scala (Maven / Gradle / Ivy)
package com.databricks.labs.automl.sanitize
import com.databricks.labs.automl.exceptions.BooleanFieldFillException
import com.databricks.labs.automl.inference.{NaFillConfig, NaFillPayload}
import com.databricks.labs.automl.model.tools.split.PerformanceSettings
import com.databricks.labs.automl.utils.structures.FeatureEngineeringEnums.FeatureEngineeringEnums
import com.databricks.labs.automl.utils.structures.{
FeatureEngineeringAllowables,
FeatureEngineeringEnums
}
import com.databricks.labs.automl.utils.{
DataValidation,
SchemaUtils,
SparkSessionWrapper
}
import org.apache.spark.ml.feature.StringIndexer
import org.apache.spark.sql.DataFrame
import org.apache.spark.sql.functions._
import org.apache.spark.sql.types._
import scala.collection.mutable.ArrayBuffer
class DataSanitizer(data: DataFrame)
extends DataValidation
with SparkSessionWrapper {
private var _labelCol = "label"
private var _featureCol = "features"
private var _numericFillStat = "mean"
private var _characterFillStat = "max"
private var _modelSelectionDistinctThreshold = 10
private var _fieldsToIgnoreInVector = Array.empty[String]
private var _filterPrecision: Double = 0.01
private var _parallelism: Int = 20
private var _categoricalNAFillMap: Map[String, String] =
Map.empty[String, String]
private var _numericNAFillMap: Map[String, AnyVal] = Map.empty[String, AnyVal]
private var _characterNABlanketFill: String = ""
private var _numericNABlanketFill: Double = 0.0
private var _naFillMode: String = "auto"
final private val _allowableNAFillModes: List[String] =
List(
"auto",
"mapFill",
"blanketFillAll",
"blanketFillCharOnly",
"blanketFillNumOnly"
)
def setLabelCol(value: String): this.type = {
_labelCol = value
this
}
def setFeatureCol(value: String): this.type = {
_featureCol = value
this
}
def setNumericFillStat(value: String): this.type = {
_numericFillStat = value
this
}
def setCharacterFillStat(value: String): this.type = {
_characterFillStat = value
this
}
def setModelSelectionDistinctThreshold(value: Int): this.type = {
_modelSelectionDistinctThreshold = value
this
}
def setFieldsToIgnoreInVector(value: Array[String]): this.type = {
_fieldsToIgnoreInVector = value
this
}
def setParallelism(value: Int): this.type = {
_parallelism = value
this
}
def setFilterPrecision(value: Double): this.type = {
if (value == 0.0)
println(
"Warning! Precision of 0 is an exact calculation of quantiles and may not be performant!"
)
_filterPrecision = value
this
}
def setCategoricalNAFillMap(value: Map[String, String]): this.type = {
_categoricalNAFillMap = value
this
}
def setNumericNAFillMap(value: Map[String, AnyVal]): this.type = {
_numericNAFillMap = value
this
}
def setCharacterNABlanketFillValue(value: String): this.type = {
_characterNABlanketFill = value
this
}
def setNumericNABlanketFillValue(value: Double): this.type = {
_numericNABlanketFill = value
this
}
/**
* Setter for determining the fill mode for handling na values.
*
* @param value Mode for na fill
* Available modes:
* auto : Stats-based na fill for fields. Usage of .setNumericFillStat and
* .setCharacterFillStat will inform the type of statistics that will be used to fill.
* mapFill : Custom by-column overrides to 'blanket fill' na values on a per-column
* basis. The categorical (string) fields are set via .setCategoricalNAFillMap while the
* numeric fields are set via .setNumericNAFillMap.
* blanketFillAll : Fills all fields based on the values specified by
* .setCharacterNABlanketFillValue and .setNumericNABlanketFillValue. All NA's for the
* appropriate types will be filled in accordingly throughout all columns.
* blanketFillCharOnly Will use statistics to fill in numeric fields, but will replace
* all categorical character fields na values with a blanket fill value.
* blanketFillNumOnly Will use statistics to fill in character fields, but will replace
* all numeric fields na values with a blanket value.
* @author Ben Wilson, Databricks
* @since 0.5.2
* @throws IllegalArgumentException if mode is not supported
*/
@throws(classOf[IllegalArgumentException])
def setNAFillMode(value: String): this.type = {
require(
_allowableNAFillModes.contains(value),
s"NA fill mode $value is not supported. Must be one of : " +
s"${_allowableNAFillModes.mkString(", ")}"
)
_naFillMode = value
this
}
def getLabel: String = _labelCol
def getFeatureCol: String = _featureCol
def getNumericFillStat: String = _numericFillStat
def getCharacterFillStat: String = _characterFillStat
def getModelSelectionDistinctThreshold: Int = _modelSelectionDistinctThreshold
def getFieldsToIgnoreInVector: Array[String] = _fieldsToIgnoreInVector
def getParallelism: Int = _parallelism
def getFilterPrecision: Double = _filterPrecision
def getCategoricalNAFillMap: Map[String, String] = _categoricalNAFillMap
def getNumericNAFillMap: Map[String, AnyVal] = _numericNAFillMap
def getCharacterNABlanketFillValue: String = _characterNABlanketFill
def getNumericNABlanketFillValue: Double = _numericNABlanketFill
def getNaFillMode: String = _naFillMode
private var _labelValidation: Boolean = false
def labelValidationOn(): this.type = {
_labelValidation = true
this
}
private def convertLabel(df: DataFrame): DataFrame = {
val stringIndexer = getLabelIndexer(df)
stringIndexer
.fit(data)
.transform(data)
.withColumn(this._labelCol, col(s"${this._labelCol}_si"))
.drop(this._labelCol + "_si")
}
def getLabelIndexer(df: DataFrame): StringIndexer = {
new StringIndexer()
.setInputCol(this._labelCol)
.setOutputCol(this._labelCol + "_si")
}
private def refactorLabel(df: DataFrame, labelColumn: String): DataFrame = {
SchemaUtils
.extractSchema(df.schema)
.foreach(
x =>
x.fieldName match {
case `labelColumn` =>
x.dataType match {
case StringType => labelValidationOn()
case BooleanType => labelValidationOn()
case BinaryType => labelValidationOn()
case _ => None
}
case _ => None
}
)
if (_labelValidation) convertLabel(df) else df
}
private def metricConversion(metric: String): String = {
val allowableFillArray = Array("min", "25p", "mean", "median", "75p", "max")
assert(
allowableFillArray.contains(metric),
s"The metric supplied, '$metric' is not in: " +
s"${invalidateSelection(metric, allowableFillArray)}"
)
val summaryMetric = metric match {
case "25p" => "25%"
case "median" => "50%"
case "75p" => "75%"
case _ => metric
}
summaryMetric
}
private def getBatches(items: List[String]): Array[List[String]] = {
val batches = ArrayBuffer[List[String]]()
val batchSize = items.length / _parallelism
for (i <- 0 to items.length by batchSize) {
batches.append(items.slice(i, i + batchSize))
}
batches.toArray
}
private def getFieldsAndFillable(df: DataFrame,
columnList: List[String],
statistics: String): DataFrame = {
val readyDF = df.repartition(PerformanceSettings.parTasks).cache
readyDF.foreach(_ => ())
val selectionColumns = "Summary" +: columnList
val x = if (statistics.isEmpty) {
val colBatches = getBatches(columnList)
colBatches
.map { batch =>
readyDF
.select(batch map col: _*)
.summary()
.select("Summary" +: batch map col: _*)
}
.seq
.toArray
.reduce((x, y) => x.join(broadcast(y), Seq("Summary")))
} else {
readyDF
.summary(statistics.replaceAll(" ", "").split(","): _*)
.select(selectionColumns map col: _*)
}
readyDF.unpersist(true)
x
}
private def assemblePayload(df: DataFrame,
fieldList: List[String],
filterCondition: String): Array[(String, Any)] = {
val summaryStats = getFieldsAndFillable(df, fieldList, filterCondition)
.drop(col("Summary"))
val summaryColumns = summaryStats.columns
val summaryValues = summaryStats.collect()(0).toSeq.toArray
summaryColumns.zip(summaryValues)
}
private def getCategoricalFillType(value: String): FeatureEngineeringEnums = {
value match {
case "min" => FeatureEngineeringEnums.MIN
case "max" => FeatureEngineeringEnums.MAX
}
}
/**
* Boolean filling based on the filterCondition for categorical data (min or max)
* @param df DataFrame containing BooleanType fields that may have null values
* @param fieldList List of the Boolean Fields
* @param filterCondition The setting of whether to use min or max values based on the data to fill na's
* @return The fill config data for the Boolean type columns consisting of ColumnName, Boolean fill value
* @since 0.6.2
* @author Ben Wilson, Databricks
* @throws BooleanFieldFillException if the mode setting for selecting the fill value is not supported.
*/
@throws(classOf[BooleanFieldFillException])
private def getBooleanFill(
df: DataFrame,
fieldList: List[String],
filterCondition: String
): Array[(String, Boolean)] = {
val filterSelection = getCategoricalFillType(filterCondition)
fieldList
.map(x => {
val booleanFieldStats =
df.select(x)
.groupBy(x)
.agg(count(col(x)).alias(FeatureEngineeringEnums.COUNT_COL.value))
val sortedStats = filterSelection match {
case FeatureEngineeringEnums.MIN =>
booleanFieldStats
.orderBy(col(FeatureEngineeringEnums.COUNT_COL.value).asc)
.head(1)
case FeatureEngineeringEnums.MAX =>
booleanFieldStats
.orderBy(col(FeatureEngineeringEnums.COUNT_COL.value).desc)
.head(1)
case _ =>
throw BooleanFieldFillException(
x,
filterCondition,
FeatureEngineeringAllowables.ALLOWED_CATEGORICAL_FILL_MODES.values
)
}
(x, sortedStats.head.getBoolean(0))
})
.toArray
}
/**
* Helper method for extraction the fields types based on the schema and calculating the statistics to be used to
* determine fill values for the columns.
*
* @param df A DataFrame that has already had the label field converted to the appropriate (Double) Type
* @return NaFillConfig : A mapping for numeric and string fields that represents the values to put in for each column.
* @since 0.1.0
* @author Ben Wilson, Databricks
*/
private def payloadExtraction(df: DataFrame): NaFillPayload = {
val typeExtract =
SchemaUtils.extractTypes(df, _labelCol, _fieldsToIgnoreInVector)
val numericPayload =
assemblePayload(
df,
typeExtract.numericFields,
metricConversion(_numericFillStat)
)
val characterPayload =
assemblePayload(
df,
typeExtract.categoricalFields,
metricConversion(_characterFillStat)
)
val booleanPayload = getBooleanFill(
df,
typeExtract.booleanFields,
metricConversion(_characterFillStat)
)
NaFillPayload(characterPayload, numericPayload, booleanPayload)
}
/**
* Helper method for ensuring that the label column isn't overridden
*
* @param payload Array of field name, value for overriding of numeric fields.
* @return Map of Field name to fill value converted to Double type.
* @since 0.5.2
* @author Ben Wilson, Databricks
*/
private def numericMapper(
payload: Array[(String, Any)]
): Map[String, Double] = {
val buffer = new ArrayBuffer[(String, Double)]
payload.map(
x =>
x._1 match {
case x._1 if x._1 != _labelCol =>
try {
buffer += ((x._1, x._2.toString.toDouble))
} catch {
case _: Exception => None
}
case _ => None
}
)
buffer.toArray.toMap
}
/**
* Helper method for ensuring that the label column isn't overridden
*
* @param payload Array of field name, value for overriding character fields.
* @return Map of Field name to fill value convert to String type.
* @since 0.5.2
* @author Ben Wilson, Databricks
*/
private def characterMapper(
payload: Array[(String, Any)]
): Map[String, String] = {
val buffer = new ArrayBuffer[(String, String)]
payload.map(
x =>
x._1 match {
case x._1 if x._1 != _labelCol =>
try {
buffer += ((x._1, x._2.toString))
} catch {
case _: Exception => None
}
case _ => None
}
)
buffer.toArray.toMap
}
/**
* Helper method for generating a statistics-based approach for calculating 'smart fillable' values for na's in the
* feature vector fields.
*
* @param df A DataFrame that has already had the label field converted to the appropriate (Double) Type
* @return NaFillConfig : A mapping for numeric and string fields that represents the values to put in for each column.
* @since 0.1.0
* @author Ben Wilson, Databricks
*/
private def fillMissing(df: DataFrame): NaFillConfig = {
val payloads = payloadExtraction(df)
val numericMapping = numericMapper(payloads.numeric)
val characterMapping = characterMapper(payloads.categorical)
NaFillConfig(
numericColumns = numericMapping,
categoricalColumns = characterMapping,
booleanColumns = payloads.boolean.toMap
)
}
/**
* Private method for applying a full blanket na fill on all fields to be involved in the feature vector.
*
* @param df A DataFrame that has already had the label field converted to the appropriate (Double) Type
* @return NaFillConfig : A mapping for numeric and string fields that represents the values to put in for each column.
* @since 0.5.2
* @author Ben Wilson, Databricks
*/
private def blanketNAFill(df: DataFrame): NaFillConfig = {
val payloadTypes =
SchemaUtils.extractTypes(df, _labelCol, _fieldsToIgnoreInVector)
val characterBuffer = new ArrayBuffer[(String, Any)]
val numericBuffer = new ArrayBuffer[(String, Any)]
payloadTypes.numericFields.foreach(
x => numericBuffer += ((x, _numericNABlanketFill))
)
payloadTypes.categoricalFields.foreach(
x => characterBuffer += ((x, _characterNABlanketFill))
)
//TODO: update Boolean overrides.
NaFillConfig(
characterMapper(characterBuffer.toArray),
numericMapper(numericBuffer.toArray),
payloadTypes.booleanFields.map(x => (x, false)).toMap
)
}
/**
* Private method for applying a full blanket na fill on only character fields to be involved in the feature vector.
* Numeric fields will use the stats mode defined in .setNumericFillStat
*
* @param df A DataFrame that has already had the label field converted to the appropriate (Double) Type
* @return NaFillConfig : A mapping for numeric and string fields that represents the values to put in for each column.
* @since 0.5.2
* @author Ben Wilson, Databricks
*/
private def blanketFillCharOnly(df: DataFrame): NaFillConfig = {
val payloads = fillMissing(df)
val buffer = new ArrayBuffer[(String, String)]
payloads.categoricalColumns.map(
x => buffer += ((x._1, _characterNABlanketFill))
)
NaFillConfig(
characterMapper(buffer.toArray),
payloads.numericColumns,
payloads.booleanColumns
)
}
/**
* Private method for applying a full blanket na fill on only numeric fields to be involved in the feature vector.
* Character fields will use the stats mode defined in .setCharacterFillStat
*
* @param df A DataFrame that has already had the label field converted to the appropriate (Double) Type
* @return NaFillConfig : A mapping for numeric and string fields that represents the values to put in for each column.
* @since 0.5.2
* @author Ben Wilson, Databricks
*/
private def blanketFillNumOnly(df: DataFrame): NaFillConfig = {
val payloads = fillMissing(df)
val buffer = new ArrayBuffer[(String, Double)]
payloads.numericColumns.map(x => buffer += ((x._1, _numericNABlanketFill)))
NaFillConfig(
payloads.categoricalColumns,
numericMapper(buffer.toArray),
payloads.booleanColumns
)
}
/**
* Validation run-time check for supplied maps, if used.
*
* @param df A DataFrame that has already had the label field converted to the appropriate (Double) Type
* @since 0.5.2
* @author Ben Wilson, Databricks
* @throws IllegalArgumentException if a map value refers to a column not in the dataset
* @throws UnsupportedOperationException if no map overrides have been specified in the run configuration
*/
@throws(classOf[UnsupportedOperationException])
@throws(classOf[IllegalArgumentException])
private def validateMapSchemaMembership(df: DataFrame): Unit = {
val suppliedSchema = df.schema.names
if (_numericNAFillMap.nonEmpty)
_numericNAFillMap.keys.foreach(
x =>
require(
suppliedSchema.contains(x),
s"Field $x supplied in .setNumericNAFillMap() is not a valid column name in the DataFrame."
)
)
if (_categoricalNAFillMap.nonEmpty)
_categoricalNAFillMap.keys.foreach(
x =>
require(
suppliedSchema.contains(x),
s"Field $x supplied in .setCategoricalNAFillMap() is not a valid column name in the DataFrame."
)
)
if (_categoricalNAFillMap.isEmpty && _numericNAFillMap.isEmpty)
throw new UnsupportedOperationException(
s"Map Fill mode has been defined for NA Fill but " +
s"no map overrides have been specified. Check configuration and ensure that either categoricalNAFillMap " +
s"or numericNAFillMap values have been set."
)
}
/**
* Private method for submitting a Map of categorical and numeric overrides for na fill based on column name -> value
* as set in .setCategoricalNAFillMap and .setNumericNAFillMap any fields not included in these maps will use the
* statistics-based approaches to fill na's.
*
* @param df A DataFrame that has already had the label field converted to the appropriate (Double) Type
* @return NaFillConfig : A mapping for numeric and string fields that represents the values to put in for each column.
* @since 0.5.2
* @author Ben Wilson, Databricks
*/
private def mapNAFill(df: DataFrame): NaFillConfig = {
validateMapSchemaMembership(df)
val payloads = fillMissing(df)
val numBuffer = new ArrayBuffer[(String, Double)]
val charBuffer = new ArrayBuffer[(String, String)]
payloads.categoricalColumns.map(
x =>
x._1 match {
case x._1 if _categoricalNAFillMap.contains(x._1) =>
charBuffer += ((x._1, _categoricalNAFillMap(x._1).toString))
case _ => charBuffer += x
}
)
payloads.numericColumns.map(
x =>
x._1 match {
case x._1 if _numericNAFillMap.contains(x._1) =>
numBuffer += ((x._1, _numericNAFillMap(x._1).toString.toDouble))
case _ => numBuffer += x
}
)
NaFillConfig(
characterMapper(charBuffer.toArray),
numericMapper(numBuffer.toArray),
payloads.booleanColumns.map(x => x._1 -> false)
)
}
/**
* Private method for handling control logic depending on na fill mode selected
*
* @param df A DataFrame that has already had the label field converted to the appropriate (Double) Type
* @return NaFillConfig : A mapping for numeric and string fields that represents the values to put in for each column.
* @since 0.5.2
* @author Ben Wilson, Databricks
*/
private def fillNA(df: DataFrame): NaFillConfig = {
_naFillMode match {
case "auto" => fillMissing(df)
case "blanketFillAll" =>
blanketNAFill(df)
case "blanketFillCharOnly" => blanketFillCharOnly(df)
case "blanketFillNumOnly" => blanketFillNumOnly(df)
case "mapFill" => mapNAFill(df)
case _ =>
throw new UnsupportedOperationException(
s"The naFill Mode ${_naFillMode} is not supported. " +
s"Must be one of: ${_allowableNAFillModes.mkString(", ")}"
)
}
}
def decideModel(): String = {
val uniqueLabelCounts = data
.select(approx_count_distinct(_labelCol, rsd = _filterPrecision))
.rdd
.map(row => row.getLong(0))
.take(1)(0)
val decision = uniqueLabelCounts match {
case x if x <= _modelSelectionDistinctThreshold => "classifier"
case _ => "regressor"
}
decision
}
def generateCleanData(
naFillConfig: NaFillConfig = null,
refactorLabelFlag: Boolean = true,
decidedModel: String = ""
): (DataFrame, NaFillConfig, String) = {
val preFilter = if (refactorLabelFlag) {
refactorLabel(data, _labelCol)
} else {
data
}
val fillMap = if (naFillConfig == null) {
fillNA(preFilter)
} else {
naFillConfig
}
val filledData = preFilter.na
.fill(fillMap.numericColumns)
.na
.fill(fillMap.categoricalColumns)
.na
.fill(fillMap.booleanColumns)
.filter(col(_labelCol).isNotNull)
.filter(!col(_labelCol).isNaN)
.toDF()
if (decidedModel != null && decidedModel.nonEmpty) {
(filledData, fillMap, decidedModel)
} else {
(filledData, fillMap, decideModel())
}
}
}
// © 2015 - 2025 Weber Informatics LLC | Privacy Policy