
org.apache.spark.ml.automl.feature.BinaryEncoder.scala
Databricks Labs AutoML toolkit (automatedml_2.11)
package org.apache.spark.ml.automl.feature
import org.apache.hadoop.fs.Path
import org.apache.spark.SparkException
import org.apache.spark.ml.attribute._
import org.apache.spark.ml.linalg.Vectors
import org.apache.spark.ml.param._
import org.apache.spark.ml.param.shared.{
HasHandleInvalid,
HasInputCols,
HasOutputCols
}
import org.apache.spark.ml.util._
import org.apache.spark.ml.{Estimator, Model}
import org.apache.spark.sql.expressions.UserDefinedFunction
import org.apache.spark.sql.functions.{col, lit, udf}
import org.apache.spark.sql.types.{DoubleType, StructField, StructType}
import org.apache.spark.sql.{DataFrame, Dataset}
trait BinaryEncoderBase
extends Params
with HasHandleInvalid
with HasInputCols
with HasOutputCols {
/**
* Configuration of the Parameter for handling invalid entries in a previously modeled feature column.
*/
override val handleInvalid: Param[String] = new Param[String](
this,
"handleInvalid",
"Handling invalid data flag for utilizing the BinaryEncoderModel during the transform method call. " +
"Options are: 'keep' (encodes unknown values as an all-1's binary vector, treated as an unknown categorical class) or " +
"'error' (throw an error if an unknown value is introduced).",
ParamValidators.inArray(BinaryEncoder.supportedHandleInvalids)
)
setDefault(handleInvalid, BinaryEncoder.ERROR_INVALID)
/**
* Method for validating the resultant schema from the application of building and transforming using this
* encoder package. The purpose of validation is to ensure that the supplied input columns are of the correct
* binary or nominal (ordinal numeric) type and that the output will contain the correct number of columns
* based on the configuration set.
* @param schema The schema of the dataset supplied for training of the model or used in transforming using the model
* @param keepInvalid Boolean flag for whether to allow an additional binary encoding value to be used for
* any values that were unknown at the time of model training; such values are
* converted to a 'max binary value' (all 1's) at the encoding length + 1.
* @return StructType that represents the transformed schema with additional output columns appended to the
* dataset structure.
* @since 0.5.3
* @throws UnsupportedOperationException if the configured input cols and output cols do not match one another in
* length.
* @author Ben Wilson, Databricks
*/
@throws(classOf[UnsupportedOperationException])
protected def validateAndTransformSchema(schema: StructType,
keepInvalid: Boolean): StructType = {
val inputColNames = $(inputCols)
val outputColNames = $(outputCols)
require(
inputColNames.length == outputColNames.length,
s"The supplied number of input columns " +
s"(${inputColNames.length}) to BinaryEncoder " +
s"does not match the output columns count (${outputColNames.length}).\n InputCols: ${inputColNames.mkString(", ")}" +
s"\n OutputCols: ${outputColNames.mkString(", ")}"
)
// Validate that the supplied input columns are of numeric type
inputColNames.foreach(SchemaUtils.checkNumericType(schema, _))
val inputFields = $(inputCols).map(schema(_))
val outputFields = inputFields.zip(outputColNames).map {
case (inputField, outputColName) =>
BinaryEncoderCommon
.transformOutputColumnSchema(inputField, outputColName, keepInvalid)
}
outputFields.foldLeft(schema) {
case (newSchema, outputField) =>
StructType(newSchema.fields :+ outputField)
}
}
}
class BinaryEncoder(override val uid: String)
extends Estimator[BinaryEncoderModel]
with DefaultParamsWritable
with HasInputCols
with HasOutputCols
with BinaryEncoderBase {
def this() = this(Identifiable.randomUID("binaryEncoder"))
/**
* Setter for supplying the array of input columns to be encoded with the BinaryEncoder type
* @param values Array of column names
* @since 0.5.3
* @author Ben Wilson, Databricks
*/
def setInputCols(values: Array[String]): this.type = set(inputCols, values)
/**
* Setter for supplying the array of output columns that are the result of running a .transform from a trained
* model on an appropriate dataset of compatible schema
* @param values Array of column names that will be generated through a .transform
* @since 0.5.3
* @author Ben Wilson, Databricks
*/
def setOutputCols(values: Array[String]): this.type = set(outputCols, values)
/**
* Setter for supplying an optional 'keep' or 'error' (Default: 'error') behavior for unseen values that arrive at a
* pre-trained model. With the 'keep' setting, an additional vector position is added to the output column
* to ensure no collisions can exist with real data, and the values throughout each of the Array[Double] locations
* in the DenseVector output will all be set to '1'
* @param value String: either 'keep' or 'error' (Default: 'error')
* @throws SparkException if the configuration value supplied is not either 'keep' or 'error'
* @since 0.5.3
* @author Ben Wilson, Databricks
*/
@throws(classOf[SparkException])
def setHandleInvalid(value: String): this.type = set(handleInvalid, value)
override def transformSchema(schema: StructType): StructType = {
val keepInvalid = $(handleInvalid) == BinaryEncoder.KEEP_INVALID
validateAndTransformSchema(schema, keepInvalid = keepInvalid)
}
/**
* Main fit method that will build a BinaryEncoder model from the data set and the configured input and output columns
* specified in the setters.
* The primary principle at work here is dimensionality reduction for the encoding of extremely high-cardinality
* StringIndexed columns. OneHotEncoding works extremely well for this purpose, but has the side effect of
* requiring an extremely large number of columns to be generated; the result of performing OHE is increased memory
* pressure. This package allows for a lossy reduction of this space by distilling the information into a binary
* string encoding space whose size is dynamic, based on the encoded length of the maximum nominal value as
* represented in binary.
*
* @example e.g. if the cardinality of a nominal column is 113, the binary representation of that is 1110001.
* When using OHE, this would result in 113 (or 114 if allowing invalids) binary positions within a sparse
* vector, creating 113 or 114 columns in the dataset. However, using BinaryEncoder, we are left with 7
* (or 8, if allowing invalids) dense vector positions to capture the same amount of information.
*
* @note Due to the nature of this encoding and how the majority of models learn, this is seen as an information
* loss encoding. However, considering that high-cardinality non-numeric nominal fields are frequently
* discarded due to the explosion of the data set size, this provides the ability to utilize high-cardinality
* fields that otherwise would not be able to be included.
* @param dataset The dataset (or DataFrame) used in training the model
* @return BinaryEncoderModel - a serializable artifact that has the output schema and encoding embedded within it.
* @since 0.5.3
* @author Ben Wilson, Databricks
*/
override def fit(dataset: Dataset[_]): BinaryEncoderModel = {
transformSchema(dataset.schema)
val mutatedSchema =
validateAndTransformSchema(dataset.schema, keepInvalid = false)
// Build an array of the size of the targeted binary arrays to be encoded
val categoricalColumnSizes = new Array[Int]($(outputCols).length)
// populate the Array of categorical column sizes with the size information for fitting of the model
val columnToScanIndices = $(outputCols).zipWithIndex.flatMap {
case (outputColName, idx) =>
val classCount =
AttributeGroup.fromStructField(mutatedSchema(outputColName)).size
if (classCount < 0) {
Some(idx)
} else {
categoricalColumnSizes(idx) = classCount
None
}
}
// If the metadata doesn't have the attribute information, extract it manually.
if (columnToScanIndices.length > 0) {
val inputColNames = columnToScanIndices.map($(inputCols)(_))
val outputColNames = columnToScanIndices.map($(outputCols)(_))
val attrGroups = BinaryEncoderCommon.getOutputAttrGroupFromData(
dataset,
inputColNames,
outputColNames
)
attrGroups.zip(columnToScanIndices).foreach {
case (attrGroup, idx) =>
categoricalColumnSizes(idx) = attrGroup.size
}
}
val model =
new BinaryEncoderModel(uid, categoricalColumnSizes).setParent(this)
copyValues(model)
}
override def copy(extra: ParamMap): BinaryEncoder = defaultCopy(extra)
}
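// Illustration (not from the original source) of the dimensionality reduction described
// in the fit() scaladoc above: a nominal column of cardinality 113 needs 113 (or 114)
// OneHotEncoder positions, but only 7 (or 8) binary positions:
//
//   scala> 113.toBinaryString
//   res0: String = 1110001
//   scala> 113.toBinaryString.length // DenseVector size without keepInvalid
//   res1: Int = 7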
object BinaryEncoder extends DefaultParamsReadable[BinaryEncoder] {
private[feature] val KEEP_INVALID: String = "keep"
private[feature] val ERROR_INVALID: String = "error"
private[feature] val supportedHandleInvalids: Array[String] =
Array(KEEP_INVALID, ERROR_INVALID)
override def load(path: String): BinaryEncoder = super.load(path)
}
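// A minimal usage sketch (column and DataFrame names below are hypothetical; the input
// columns are assumed to already be indexed, e.g. by StringIndexer):
//
//   val encoder = new BinaryEncoder()
//     .setInputCols(Array("category_idx", "region_idx"))
//     .setOutputCols(Array("category_bin", "region_bin"))
//     .setHandleInvalid("keep")
//   val model = encoder.fit(df)
//   val encoded = model.transform(df) // appends DenseVector columns category_bin and region_bin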
class BinaryEncoderModel(override val uid: String,
val categorySizes: Array[Int])
extends Model[BinaryEncoderModel]
with BinaryEncoderBase
with MLWritable {
import BinaryEncoderModel._
/**
* Helper method for adjusting the sizes that the Binary Encoder will need in order to capture the BinaryString
* representation of the Nominal data, accounting for the keepInvalid flag.
* @since 0.5.3
*/
private def getConfigedCategorySizes: Array[Int] = {
val keepInvalid = getHandleInvalid == BinaryEncoder.KEEP_INVALID
if (keepInvalid) {
categorySizes.map(_ + 1)
} else {
categorySizes
}
}
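// e.g. with categorySizes = Array(113, 2) and handleInvalid = "keep", this yields
// Array(114, 3): one extra category slot per column to absorb unseen values.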
/**
* Main UDF for performing the conversion of nominal / binary encoded data from StringIndexer to a BinaryString format
* as a Spark ML DenseVector
* @throws SparkException if the value to be encoded was not within the column's values during training
* (handleInvalid set to 'error')
* @since 0.5.3
* @author Ben Wilson, Databricks
*/
@throws(classOf[SparkException])
private def encoder: UserDefinedFunction = {
val keepInvalid = getHandleInvalid == BinaryEncoder.KEEP_INVALID
val localCategorySizes = categorySizes
udf { (label: Any, colIdx: Int) =>
val origCategorySize = localCategorySizes(colIdx)
val maxCategorySize =
BinaryEncoderCommon.convertToBinaryString(Some(origCategorySize)).length
// Add an additional index position to the vector if unseen values are being kept
val idxLength = if (keepInvalid) maxCategorySize + 1 else maxCategorySize
val encodedData =
BinaryEncoderCommon.convertToBinary(Some(label), idxLength)
val idx = if (encodedData.length <= origCategorySize) {
encodedData
} else {
if (keepInvalid) {
Array.fill(maxCategorySize)(1.0)
} else {
throw new SparkException(
s"The value specified for Binary Encoding (${label.toString}) in column index " +
s"$colIdx has not been seen " +
s"during training of this model. To enable reclassification of unseen values, set the handleInvalid " +
s"parameter to ${BinaryEncoder.KEEP_INVALID}"
)
}
}
Vectors.dense(idx)
}
}
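// Behavior sketch for the udf above (illustrative values), for a column where
// categorySizes(colIdx) = 113, i.e. a 7-bit encoding (8 bits when handleInvalid = "keep"):
//   label 5.0 with "error" -> convertToBinary(Some(5.0), 7) -> [0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 1.0]
//   unseen value with "keep"  -> an all-1's vector
//   unseen value with "error" -> SparkException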
/**
* Setter for specifying the column names in Array format for the columns intended to be Binary Indexed.
* @param values Array of column names
* @since 0.5.3
* @author Ben Wilson, Databricks
*/
def setInputCols(values: Array[String]): this.type = set(inputCols, values)
/**
* Setter for specifying the desired output columns in Array format for the columns to be generated as
* DenseVectors when the model is used to transform a dataset
* @param values Array of output column names
* @note the index position relationship between setInputCols and setOutputCols is 1-to-1: the
* positional order and lengths must match.
* @since 0.5.3
* @author Ben Wilson, Databricks
*/
def setOutputCols(values: Array[String]): this.type = set(outputCols, values)
/**
* Setter for whether to allow for unseen indexed nominal values to be used in the transformation of a dataset with
* the generated BinaryEncoderModel.
* @note Default: 'error' optional settings: 'keep' or 'error'
* @param value The setting to be used: either 'keep' or 'error'
* @since 0.5.3
* @author Ben Wilson, Databricks
*/
def setHandleInvalid(value: String): this.type = set(handleInvalid, value)
/**
* Method for mutating the dataset schema to support the addition of BinaryEncoded columns
* @param schema the schema of the dataset
* @since 0.5.3
* @author Ben Wilson, Databricks
*/
override def transformSchema(schema: StructType): StructType = {
val inputColNames = $(inputCols)
require(
inputColNames.length == categorySizes.length,
s"The number of input columns specified " +
s"(${inputColNames.length}) must be the same as the number of feature columns during the fit (${categorySizes.length})"
)
val keepInvalid = $(handleInvalid) == BinaryEncoder.KEEP_INVALID
val transformedSchema =
validateAndTransformSchema(schema, keepInvalid = keepInvalid)
verifyNumOfValues(transformedSchema)
}
/**
* Private method for validating that the schema metadata matches the expected cardinality for the column to be encoded
* @param schema schema of the input dataset
* @throws IllegalArgumentException if the metadata information does not match the data cardinality
* @return the schema of the dataset (pass-through method)
* @since 0.5.3
* @author Ben Wilson, Databricks
*/
@throws(classOf[IllegalArgumentException])
private def verifyNumOfValues(schema: StructType): StructType = {
val configedSizes = getConfigedCategorySizes
$(outputCols).zipWithIndex.foreach {
case (outputColName, idx) =>
val inputColName = $(inputCols)(idx)
val attrGroup = AttributeGroup.fromStructField(schema(outputColName))
if (attrGroup.attributes.nonEmpty) {
val numCategories = configedSizes(idx)
require(
attrGroup.size == numCategories,
s"The number of distinct values in column $inputColName " +
s"was expected to be $numCategories, but the metadata shows ${attrGroup.size} distinct values."
)
}
}
schema
}
/**
* Main transformation method that will apply the model's configured encoding through a udf to the input dataset
* and add encoded columns.
* @param dataset input dataset for the model to mutate
* @return a DataFrame with added BinaryEncoded columns
* @since 0.5.3
* @author Ben Wilson, Databricks
*/
override def transform(dataset: Dataset[_]): DataFrame = {
val transformedSchema = transformSchema(dataset.schema, logging = true)
val keepInvalid = $(handleInvalid) == BinaryEncoder.KEEP_INVALID
val encodedColumns = $(inputCols).indices.map { idx =>
val inputColName = $(inputCols)(idx)
val outputColName = $(outputCols)(idx)
val metadata = BinaryEncoderCommon
.createAttrGroupForAttrNames(
outputColName,
categorySizes(idx),
keepInvalid
)
.toMetadata()
encoder(col(inputColName).cast(DoubleType), lit(idx))
.as(outputColName, metadata)
}
dataset.withColumns($(outputCols), encodedColumns)
}
override def copy(extra: ParamMap): BinaryEncoderModel = {
val copied = new BinaryEncoderModel(uid, categorySizes)
copyValues(copied, extra).setParent(parent)
}
override def write: MLWriter = new BinaryEncoderModelWriter(this)
}
object BinaryEncoderModel extends MLReadable[BinaryEncoderModel] {
private[BinaryEncoderModel] class BinaryEncoderModelWriter(
instance: BinaryEncoderModel
) extends MLWriter {
private case class Data(categorySizes: Array[Int])
override protected def saveImpl(path: String): Unit = {
DefaultParamsWriter.saveMetadata(instance, path, sc)
val data = Data(instance.categorySizes)
val dataPath = new Path(path, "data").toString
sparkSession
.createDataFrame(Seq(data))
.repartition(1)
.write
.parquet(dataPath)
}
}
private class BinaryEncoderModelReader extends MLReader[BinaryEncoderModel] {
private val className = classOf[BinaryEncoderModel].getName
override def load(path: String): BinaryEncoderModel = {
val metadata = DefaultParamsReader.loadMetadata(path, sc, className)
val dataPath = new Path(path, "data").toString
val data = sparkSession.read
.parquet(dataPath)
.select("categorySizes")
.head()
val categorySizes = data.getAs[Seq[Int]](0).toArray
val model = new BinaryEncoderModel(metadata.uid, categorySizes)
metadata.getAndSetParams(model)
model
}
}
override def read: MLReader[BinaryEncoderModel] = new BinaryEncoderModelReader
override def load(path: String): BinaryEncoderModel = super.load(path)
}
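// Persistence round-trip sketch (the path is hypothetical):
//   model.write.overwrite().save("/tmp/binaryEncoderModel")
//   val restored = BinaryEncoderModel.load("/tmp/binaryEncoderModel")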
private[feature] object BinaryEncoderCommon {
/**
* Helper method for appropriately padding leading zeros based on the BinaryString length value and the
* target encoding length
* @param inputString A supplied BinaryString encoded value
* @throws IllegalArgumentException if the target encoding size is smaller than the data size after encoding,
* since this would result in significant data loss and issues with modeling.
* @return Padded string to the prescribed encoding length
*/
@throws(classOf[IllegalArgumentException])
private[feature] def padZeros(inputString: String,
encodingSize: Int): String = {
val deltaLength = encodingSize - inputString.length
deltaLength match {
case 0 => inputString
case x if x > 0 => "0" * x + inputString
case _ =>
throw new IllegalArgumentException(
s"Target encoding size $encodingSize of BinaryString is less " +
s"than total encoded information. Information loss of a substantial degree would be generated. " +
s"Adjust encodingSize higher."
)
}
}
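// e.g. padZeros("101", 7) returns "0000101", while padZeros("1110001", 3) throws
// IllegalArgumentException because 3 positions cannot hold 7 encoded bits.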
/**
* Private encoding method.
* @param value takes in an Option of Any type and casts it to a BinaryString representation
* @note Conversions from non-whole-number values of numeric types WILL incur information loss, as decimal values
* cannot be represented by a simple BinaryString; such values are truncated to a whole number.
* @tparam A Any type
* @return Binary String conversion of the input data
* @throws UnsupportedOperationException if the data type being input is not of the correct type for conversion.
*/
@throws(classOf[UnsupportedOperationException])
private[feature] def convertToBinaryString[A <: Any](
value: Option[A]
): String = {
value.get match {
case a: Boolean => if (a) "1" else "0"
case a: Byte => a.toByte.toBinaryString
case a: Char => a.toChar.toBinaryString
case a: Int => a.toInt.toBinaryString
case a: Long => a.toLong.toBinaryString
case a: Float => a.toFloat.toByte.toBinaryString
case a: Double => a.toString.toDouble.toByte.toBinaryString
case a: String =>
a.toString.toCharArray
.flatMap(_.toBinaryString)
.mkString("")
case a: BigDecimal => a.toString.toByte.toBinaryString
case _ =>
throw new UnsupportedOperationException(
s"convertToBinaryString does not support type: " +
s"${value.get.getClass.getSimpleName}"
)
}
}
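// e.g. convertToBinaryString(Some(113))  == "1110001"
//      convertToBinaryString(Some(true)) == "1"
//      convertToBinaryString(Some(5L))   == "101"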
/**
* Private method for converting Any type to an Array of Doubles with appropriate prefix padding to ensure
* constant vector length of the output DenseVector from transformation.
* @param ordinalValue The value to encode
* @param encodingSize The size of the Array of Binary Double values for output
* @tparam A Any type (only primitives are supported; collections will throw an exception)
* @return Array of Doubles, representing the Binary values of the encoded value
* @since 0.5.3
* @author Ben Wilson, Databricks
*/
private[feature] def convertToBinary[A <: Any](
ordinalValue: Option[A],
encodingSize: Int
): Array[Double] = {
val binaryString = convertToBinaryString(ordinalValue)
val padded = padZeros(binaryString, encodingSize)
binaryStringToDoubleArray(padded)
}
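// e.g. convertToBinary(Some(5), 7) == Array(0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 1.0)
// ("101" padded to "0000101", then each bit mapped to a Double)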
/**
* Private method for converting a binary string to Array[Double]
* @param binary Binary Encoded String
* @return Array[Double] for the numeric vector-representation of an encoded value in Binary format
* @since 0.5.3
* @author Ben Wilson, Databricks
*/
private[feature] def binaryStringToDoubleArray(
binary: String
): Array[Double] = {
binary.toCharArray.map(_.toString.toDouble)
}
/**
* Pulled from OneHotEncoder with slight modifications.
* @param dataset Input dataset
* @param inputColNames The names that are to be encoded
* @param outputColNames the output columns after encoding
* @return Seq[AttributeGroup] for the schema metadata associated with the Model
* @since 0.5.3
*/
def getOutputAttrGroupFromData(
dataset: Dataset[_],
inputColNames: Seq[String],
outputColNames: Seq[String]
): Seq[AttributeGroup] = {
val columns = inputColNames.map { inputColName =>
col(inputColName).cast(DoubleType)
}
val numOfColumns = columns.length
val numAttrsArray = dataset
.select(columns: _*)
.rdd
.map { row =>
(0 until numOfColumns).map(idx => row.getDouble(idx)).toArray
}
.treeAggregate(new Array[Double](numOfColumns))(
(maxValues: Array[Double], curValues: Array[Double]) => {
(0 until numOfColumns).foreach {
idx =>
val x = curValues(idx)
assert(
x <= Int.MaxValue,
s"Index out of range for maximum ordinal value in the 32-bit int space. " +
s"Value: $x is larger than maximum of ${Int.MaxValue} in field ${inputColNames(idx)}"
)
assert(
x >= 0.0 && x == x.toInt,
s"Values from column ${inputColNames(idx)} must be indices, but got value $x, which is invalid."
)
maxValues(idx) = math.max(maxValues(idx), x)
}
maxValues
},
(m0, m1) => {
(0 until numOfColumns).foreach { idx =>
m0(idx) = math.max(m0(idx), m1(idx))
}
m0
}
)
.map(_.toInt + 1)
outputColNames.zip(numAttrsArray).map {
case (outputColName, numAttrs) =>
createAttrGroupForAttrNames(
outputColName,
numAttrs,
keepInvalid = false
)
}
}
/**
* Pulled from OneHotEncoder with slight modifications.
* @param outputColName Output Column to be created by the BinaryEncoderModel
* @param numAttrs The number of unique values for the output column
* @param keepInvalid Boolean flag for whether to allow for unseen values to be grouped into an unknown binary
* representation during transformation
* @return AttributeGroup
* @since 0.5.3
*/
def createAttrGroupForAttrNames(outputColName: String,
numAttrs: Int,
keepInvalid: Boolean): AttributeGroup = {
val maxAttributeSize = if (keepInvalid) {
BinaryEncoderCommon.convertToBinaryString(Some(numAttrs)).length + 1
} else {
BinaryEncoderCommon.convertToBinaryString(Some(numAttrs)).length
}
val outputAttrNames = Array.tabulate(maxAttributeSize)(_.toString)
genOutputAttrGroup(Some(outputAttrNames), outputColName)
}
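// e.g. numAttrs = 113 -> convertToBinaryString(Some(113)) = "1110001", so 7 binary
// attribute slots are created (8 when keepInvalid = true).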
/**
* Pulled from OneHotEncoder with slight modifications.
* @param outputAttrNames Attributes for the metadata
* @param outputColName The name of the output column to be added
* @return AttributeGroup
* @since 0.5.3
*/
private def genOutputAttrGroup(outputAttrNames: Option[Array[String]],
outputColName: String): AttributeGroup = {
outputAttrNames
.map { attrNames =>
val attrs: Array[Attribute] = attrNames.map { name =>
BinaryAttribute.defaultAttr.withName(name)
}
new AttributeGroup(outputColName, attrs)
}
.getOrElse {
new AttributeGroup(outputColName)
}
}
/**
* Pulled from OneHotEncoder with slight modifications.
* @param inputCol An input column to extract the encoding lengths for the output column during transformation
* @return Output names for attributes
*/
private def genOutputAttrNames(
inputCol: StructField
): Option[Array[String]] = {
val inputAttr = Attribute.fromStructField(inputCol)
inputAttr match {
case nominal: NominalAttribute =>
val outputCardinality =
BinaryEncoderCommon
.convertToBinaryString(Some(nominal.values.get.length))
.length
Some((0 to outputCardinality).toArray.map(_.toString))
case binary: BinaryAttribute =>
if (binary.values.isDefined) {
binary.values
} else {
Some(Array.tabulate(2)(_.toString))
}
case _: NumericAttribute =>
throw new RuntimeException(
s"The input column ${inputCol.name} cannot be continuous-valued."
)
case _ =>
None // optimistic about unknown attributes
}
}
/**
* Pulled from OneHotEncoder with slight modifications.
* @param inputCol An input column that will be used to validate the output column corresponding to it.
* @param outputColName An output column to match to the input column
* @param keepInvalid invalid retention flag
* @return the Output column struct field definition
*/
def transformOutputColumnSchema(inputCol: StructField,
outputColName: String,
keepInvalid: Boolean = false): StructField = {
val outputAttrNames = genOutputAttrNames(inputCol)
val filteredOutputAttrNames = outputAttrNames.map { names =>
if (keepInvalid) {
names ++ Seq("invalidValues")
} else {
names
}
}
genOutputAttrGroup(filteredOutputAttrNames, outputColName).toStructField()
}
}