
com.databricks.labs.automl.exploration.tools.PCAReducer.scala (Databricks Labs AutoML toolkit, automatedml_2.11)
package com.databricks.labs.automl.exploration.tools
import com.databricks.labs.automl.utils.SparkSessionWrapper
import org.apache.spark.ml.{Pipeline, PipelineModel}
import org.apache.spark.ml.feature._
import org.apache.spark.ml.linalg.{DenseMatrix, DenseVector, SparseVector}
import org.apache.spark.sql.DataFrame
import org.apache.spark.sql.functions._
import org.apache.spark.sql.types.StringType
/**
 * API wrapper for conducting a two-component PCA so that a data set's feature relationships can be readily
 * visualized. Provides DataFrame export types for both the raw data with PC1 and PC2 values and for the
 * eigenvector values.
 */
class PCAReducer() extends SparkSessionWrapper {

  final private val K_VALUE = 2
  final val SI_NAME = "_si"
  final val TEMP_LABEL_NAME = "tempLabel"
  final val VECTOR_INTERNAL_NAME = "preScaledFeatures"
  final val PCA_INTERNAL_ARRAY_NAME = "pcaArrayColumn"
  final val PCA_DIM_1_NAME = "PCA1"
  final val PCA_DIM_2_NAME = "PCA2"
  final val HANDLE_MODE = "keep"
  final val MINMAX_MIN = -1.0
  final val MINMAX_MAX = 1.0
  final val STANDARD_SCALER_MEAN = true
  final val STANDARD_SCALER_STDDEV = true

  final private val ALLOWABLE_SCALERS: Array[String] =
    Array("minMax", "standard", "maxAbs")

  // Accepts Any so that both SparseVector and DenseVector payloads can be handled by the
  // same UDF, converting the PCA output vector to Array[Double] for per-column extraction.
  final private val convertVectorToArray: Any => Array[Double] = x => {
    val v = x match {
      case x: SparseVector => x.toDense
      case _               => x.asInstanceOf[DenseVector]
    }
    v.toArray
  }

  final private val vectorConverter = udf(convertVectorToArray)

  var labelColumn: String = "label"
  var featureColumn: String = "feature"
  var pcaFeatureColumn: String = "pcaFeatures"
  var scalerType: String = "minMax"
  var reportMode: String = "original"
  def setLabelColumn(value: String): this.type = {
    labelColumn = value
    this
  }

  def setFeatureColumn(value: String): this.type = {
    featureColumn = value
    this
  }

  def setPcaFeatureColumn(value: String): this.type = {
    pcaFeatureColumn = value
    this
  }

  def setScalerType(value: String): this.type = {
    assert(
      ALLOWABLE_SCALERS.contains(value),
      s"Scaler Type $value is not supported. Must be one of ${ALLOWABLE_SCALERS.mkString(", ")}"
    )
    scalerType = value
    this
  }

  def withStandardScaling: this.type = {
    scalerType = "standard"
    this
  }

  def withMinMaxScaling: this.type = {
    scalerType = "minMax"
    this
  }

  def withMaxAbsScaling: this.type = {
    scalerType = "maxAbs"
    this
  }

  def withConvertedColumns: this.type = {
    reportMode = "all"
    this
  }

  def withOriginalColumns: this.type = {
    reportMode = "original"
    this
  }

  def getLabelColumn: String = labelColumn
  def getFeatureColumn: String = featureColumn
  def getPcaFeatureColumn: String = pcaFeatureColumn
  def getScalerType: String = scalerType
  private def checkLabelColumn(data: DataFrame): Unit = {
    assert(
      data.schema.names.contains(labelColumn),
      s"The label column specified '$labelColumn' is not present in the " +
        s"schema payload: '${data.schema.names.mkString(", ")}'"
    )
  }
  /**
   * Validation and checking of the label's data structure. If non-numeric (StringType), encode the column
   * with a StringIndexer.
   * @param data A DataFrame to analyze
   * @return The original DataFrame, with a StringIndexed label column if required.
   * @author Ben Wilson, Databricks
   * @since 0.7.2
   */
  private def labelEncode(data: DataFrame): DataFrame = {

    checkLabelColumn(data)

    val labelType = data.schema.filter(_.name == labelColumn).head.dataType

    labelType match {
      case StringType =>
        val indexer = new StringIndexer()
          .setInputCol(labelColumn)
          .setOutputCol(TEMP_LABEL_NAME)
        indexer
          .fit(data)
          .transform(data)
          .drop(labelColumn)
          .withColumnRenamed(TEMP_LABEL_NAME, labelColumn)
      case _ => data
    }
  }
  /**
   * Private method for generating the full PCA pipeline that performs the dimensionality reduction.
   * @param data A DataFrame to analyze
   * @return Pipeline object that contains all of the transformation, vectorization, and PCA reduction (through SVD) stages.
   * @since 0.7.2
   * @author Ben Wilson, Databricks
   */
  private def createPipeline(data: DataFrame): Pipeline = {

    val labelFormattedData = labelEncode(data)
    val inputSchema = labelFormattedData.schema
    val fullSchemaWithoutLabel = inputSchema.names.filterNot(Set(labelColumn))

    // Any remaining StringType columns must be indexed before vectorization.
    val indexerCandidates =
      inputSchema.filter(_.dataType == StringType).map(_.name)

    val indexers = indexerCandidates
      .map(x => {
        new StringIndexer()
          .setInputCol(x)
          .setOutputCol(x + SI_NAME)
          .setHandleInvalid(HANDLE_MODE)
      })
      .toArray

    val indexerColumns = indexers.map(_.getOutputCol)

    val otherColumns =
      fullSchemaWithoutLabel.filterNot(indexerCandidates.toSet)

    val vectorAssembler = new VectorAssembler()
      .setInputCols(otherColumns ++ indexerColumns)
      .setOutputCol(VECTOR_INTERNAL_NAME)

    val preScaledStages = indexers ++ Array(vectorAssembler)

    val allStages = preScaledStages ++ Array(getScalerType match {
      case "minMax" =>
        new MinMaxScaler()
          .setInputCol(VECTOR_INTERNAL_NAME)
          .setOutputCol(featureColumn)
          .setMin(MINMAX_MIN)
          .setMax(MINMAX_MAX)
      case "maxAbs" =>
        new MaxAbsScaler()
          .setInputCol(VECTOR_INTERNAL_NAME)
          .setOutputCol(featureColumn)
      case _ =>
        new StandardScaler()
          .setInputCol(VECTOR_INTERNAL_NAME)
          .setOutputCol(featureColumn)
          .setWithMean(STANDARD_SCALER_MEAN)
          .setWithStd(STANDARD_SCALER_STDDEV)
    }) ++ Array(
      new PCA()
        .setInputCol(featureColumn)
        .setOutputCol(pcaFeatureColumn)
        .setK(K_VALUE)
    )

    new Pipeline().setStages(allStages)
  }
  /**
   * Private method for extracting the eigenvectors from the principal components matrix
   * @param pipeline Spark ml pipeline to get the vector assembler column payload
   * @param pcaMatrix matrix of eigenvectors from the pca model
   * @return Array[PCACEigenResult] that maps each column name to the eigenvectors for each principal component
   */
  private def generateEigenValues(
    pipeline: PipelineModel,
    pcaMatrix: DenseMatrix
  ): Array[PCACEigenResult] = {

    val pcaCorrelations = pcaMatrix.rowIter.map(x => x.toArray).toArray

    // Recover the original column names, stripping the StringIndexer suffix where one was
    // applied; columns that were never indexed pass through unchanged.
    val inputColumns =
      pipeline.stages
        .collect { case a: VectorAssembler => a }
        .last
        .getInputCols
        .map {
          case z if z.endsWith(SI_NAME) => z.dropRight(SI_NAME.length)
          case z                        => z
        }

    inputColumns
      .zip(pcaCorrelations)
      .map(x => PCACEigenResult(x._1, x._2.head, x._2.last))
  }
  private def generateEigenDataFrame(
    pcaEigenData: Array[PCACEigenResult]
  ): DataFrame = {
    spark.createDataFrame(pcaEigenData)
  }
  /**
   * Main method for getting a two-dimensional PCA analysis, used for charting the principal components in a
   * scatter plot and for getting the explained variances and the principal components matrix
   * @param data raw DataFrame to analyze
   * @return PCAReducerResult, consisting of the two-dimensional principal components of the data, the explained
   *         variances, and the PC matrix
   * @author Ben Wilson, Databricks
   * @since 0.7.2
   */
  def executePipeline(data: DataFrame): PCAReducerResult = {

    val originalSchema = data.schema.names

    val pipeline = createPipeline(data)
    val fitted = pipeline.fit(data)

    val fitModel = fitted.stages.last.asInstanceOf[PCAModel]
    val explainedVariances = fitModel.explainedVariance.values
    val pcMatrix = generateEigenValues(fitted, fitModel.pc)

    // Reuse the already-fit pipeline rather than fitting a second time, then split the two
    // principal component values out of the PCA vector into their own columns.
    val pcaDataInitial = fitted
      .transform(data)
      .withColumn(
        PCA_INTERNAL_ARRAY_NAME,
        vectorConverter(col(pcaFeatureColumn))
      )
      .withColumn(PCA_DIM_1_NAME, element_at(col(PCA_INTERNAL_ARRAY_NAME), 1))
      .withColumn(PCA_DIM_2_NAME, element_at(col(PCA_INTERNAL_ARRAY_NAME), 2))

    val pcaData = reportMode match {
      case "original" =>
        pcaDataInitial.select(
          originalSchema.head,
          originalSchema.tail ++ Array(PCA_DIM_1_NAME, PCA_DIM_2_NAME): _*
        )
      case "all" => pcaDataInitial
      case _ =>
        throw new UnsupportedOperationException(
          s"PCA report mode $reportMode is not supported."
        )
    }

    PCAReducerResult(
      pcaData,
      explainedVariances,
      pcMatrix,
      generateEigenDataFrame(pcMatrix)
    )
  }
}
object PCAReducer {

  def apply(labelCol: String,
            featureCol: String,
            pcaFeatureCol: String,
            scalerType: String = "minMax"): PCAReducer =
    new PCAReducer()
      .setLabelColumn(labelCol)
      .setFeatureColumn(featureCol)
      .setPcaFeatureColumn(pcaFeatureCol)
      .setScalerType(scalerType)
}
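
For reference, a minimal usage sketch of the API above. The input DataFrame `df` and its "price" label column are hypothetical, and the consumption of the result is described in comments rather than field accesses because the field names are defined by the PCAReducerResult case class shipped with the library:

import com.databricks.labs.automl.exploration.tools.PCAReducer

// Hypothetical input: `df` is any DataFrame with a "price" label column plus
// numeric and/or string feature columns.
val result = PCAReducer(
  labelCol = "price",
  featureCol = "features",
  pcaFeatureCol = "pcaFeatures",
  scalerType = "standard"
).executePipeline(df)

// The returned PCAReducerResult carries the scored DataFrame (the original
// columns with PCA1 and PCA2 appended, under the default "original" report
// mode), the explained variance per component, the per-column eigenvector
// payload, and that same payload rendered as a DataFrame for display.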