
com.databricks.labs.automl.model.NaiveBayesTuner.scala
Databricks Labs AutoML toolkit (automatedml_2.11)
package com.databricks.labs.automl.model

import com.databricks.labs.automl.params.{
  Defaults,
  NaiveBayesConfig,
  NaiveBayesModelsWithResults
}
import com.databricks.labs.automl.utils.SparkSessionWrapper
import org.apache.log4j.Logger
import org.apache.spark.ml.classification.NaiveBayes
import org.apache.spark.sql.DataFrame
import org.apache.spark.sql.functions._

import scala.collection.mutable.ArrayBuffer
class NaiveBayesTuner(df: DataFrame)
    extends SparkSessionWrapper
    with Defaults
    with Evolution {

  //TODO: finish this some time.

  // Perform a check to validate the structure and conditions of the input DataFrame to ensure that it can be modeled
  validateInputDataframe(df)

  private val logger: Logger = Logger.getLogger(this.getClass)

  private var _scoringMetric = _scoringDefaultClassifier
  private var _naiveBayesNumericBoundaries = _naiveBayesDefaultNumBoundaries
  private var _naiveBayesStringBoundaries = _naiveBayesDefaultStringBoundaries
  private var _classificationMetrics = classificationMetrics
  private var _naiveBayesThresholds = calculateThresholds()
  def setScoringMetric(value: String): this.type = {
    require(
      classificationMetrics.contains(value),
      s"Classification scoring metric $value is not a valid member of ${invalidateSelection(value, classificationMetrics)}"
    )
    this._scoringMetric = value
    this
  }
  def setNaiveBayesNumericBoundaries(
    value: Map[String, (Double, Double)]
  ): this.type = {
    this._naiveBayesNumericBoundaries = value
    this
  }

  def setNaiveBayesStringBoundaries(
    value: Map[String, List[String]]
  ): this.type = {
    this._naiveBayesStringBoundaries = value
    this
  }

  def getScoringMetric: String = _scoringMetric

  def getNaiveBayesNumericBoundaries: Map[String, (Double, Double)] =
    _naiveBayesNumericBoundaries

  def getNaiveBayesStringBoundaries: Map[String, List[String]] =
    _naiveBayesStringBoundaries

  def getClassificationMetrics: List[String] = classificationMetrics

  private def resetClassificationMetrics: List[String] =
    classificationMetricValidator(
      classificationAdjudicator(df),
      classificationMetrics
    )

  private def setClassificationMetrics(value: List[String]): this.type = {
    _classificationMetrics = value
    this
  }
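
  // Computes the relative frequency of each label class: counts rows per label,
  // orders the classes by descending count, and divides each count by the total
  // row count. The result feeds NaiveBayes.setThresholds when a generated config
  // has its thresholds flag enabled (ordering is by descending frequency, not by
  // label index).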
  private def calculateThresholds(): Array[Double] = {
    val uniqueLabels = df
      .select(_labelCol)
      .groupBy(col(_labelCol))
      .agg(count("*").alias("counts"))
      .orderBy(col("counts").desc)
      .collect()
    // count() yields a Long column, so convert before computing proportions.
    val values = uniqueLabels.map(x => x.getAs[Long]("counts").toDouble)
    val totals = values.sum
    values.map(x => x / totals)
  }
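
  // Builds a NaiveBayes estimator from a hyperparameter config: feature and label
  // columns, smoothing, model type, and (optionally) the pre-computed class
  // frequency thresholds when the config's thresholds flag is set.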
  private def configureModel(modelConfig: NaiveBayesConfig): NaiveBayes = {
    val nbModel = new NaiveBayes()
      .setFeaturesCol(_featureCol)
      .setLabelCol(_labelCol)
      .setSmoothing(modelConfig.smoothing)
      // Apply the modelType drawn from the string boundary search space.
      .setModelType(modelConfig.modelType)
    if (modelConfig.thresholds) nbModel.setThresholds(_naiveBayesThresholds)
    nbModel
  }
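
  // Returns the hyperparameter config and score of the best-performing entry in
  // the collection, ranked according to the configured optimization strategy.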
  private def returnBestHyperParameters(
    collection: ArrayBuffer[NaiveBayesModelsWithResults]
  ): (NaiveBayesConfig, Double) = {
    val bestEntry = _optimizationStrategy match {
      case "minimize" =>
        collection.result.toArray.sortWith(_.score < _.score).head
      case _ => collection.result.toArray.sortWith(_.score > _.score).head
    }
    (bestEntry.modelHyperParams, bestEntry.score)
  }
  private def evaluateStoppingScore(currentBestScore: Double,
                                    stopThreshold: Double): Boolean = {
    _optimizationStrategy match {
      case "minimize" => if (currentBestScore > stopThreshold) true else false
      case _          => if (currentBestScore < stopThreshold) true else false
    }
  }

  private def evaluateBestScore(runScore: Double,
                                bestScore: Double): Boolean = {
    _optimizationStrategy match {
      case "minimize" => if (runScore < bestScore) true else false
      case _          => if (runScore > bestScore) true else false
    }
  }
  private def sortAndReturnAll(
    results: ArrayBuffer[NaiveBayesModelsWithResults]
  ): Array[NaiveBayesModelsWithResults] = {
    _optimizationStrategy match {
      case "minimize" => results.result.toArray.sortWith(_.score < _.score)
      case _          => results.result.toArray.sortWith(_.score > _.score)
    }
  }

  private def sortAndReturnBestScore(
    results: ArrayBuffer[NaiveBayesModelsWithResults]
  ): Double = {
    sortAndReturnAll(results).head.score
  }
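
  // Generates iterationCount random hyperparameter configs: modelType is drawn
  // from the string boundaries, smoothing from the numeric boundaries, and the
  // thresholds flag from a coin flip.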
  private def generateThresholdedParams(
    iterationCount: Int
  ): Array[NaiveBayesConfig] = {
    val iterations = new ArrayBuffer[NaiveBayesConfig]
    var i = 0
    do {
      val modelType =
        generateRandomString("modelType", _naiveBayesStringBoundaries)
      val smoothing =
        generateRandomDouble("smoothing", _naiveBayesNumericBoundaries)
      val thresholds = coinFlip()
      iterations += NaiveBayesConfig(modelType, smoothing, thresholds)
      i += 1
    } while (i < iterationCount)
    iterations.toArray
  }
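
  // Fits a single NaiveBayes model for one hyperparameter config, scores the
  // held-out test split on every configured classification metric, and wraps
  // the model, its scores, and the generation number into a result object.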
  private def generateAndScoreNaiveBayes(
    train: DataFrame,
    test: DataFrame,
    modelConfig: NaiveBayesConfig,
    generation: Int = 1
  ): NaiveBayesModelsWithResults = {
    val model = configureModel(modelConfig)
    val builtModel = model.fit(train)
    val predictedData = builtModel.transform(test)

    val scoringMap = scala.collection.mutable.Map[String, Double]()
    for (i <- _classificationMetrics) {
      scoringMap(i) = classificationScoring(i, _labelCol, predictedData)
    }

    NaiveBayesModelsWithResults(
      modelConfig,
      builtModel,
      scoringMap(_scoringMetric),
      scoringMap.toMap,
      generation
    )
  }
}
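
A minimal usage sketch (not part of the original file): it relies only on the public setters shown above, `trainingData` stands in for an existing DataFrame whose feature and label columns match the Evolution trait defaults, and the metric name and boundary values are illustrative. Since the class is marked as unfinished (TODO), this listing exposes no public method that runs the tuning loop itself.

// Hypothetical configuration example; values are illustrative only.
val tuner = new NaiveBayesTuner(trainingData)
  .setScoringMetric("f1")
  .setNaiveBayesNumericBoundaries(Map("smoothing" -> (0.0, 10.0)))
  .setNaiveBayesStringBoundaries(
    Map("modelType" -> List("multinomial", "bernoulli"))
  )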