
// com.databricks.labs.automl.model.Evolution.scala Maven / Gradle / Ivy
// Go to download
// Show more of this group Show more artifacts with this name
// Show all versions of automatedml_2.11 Show documentation
// Show all versions of automatedml_2.11 Show documentation
// Databricks Labs AutoML toolkit
// The newest version!
package com.databricks.labs.automl.model
import com.databricks.labs.automl.model.tools.split.PerformanceSettings
import com.databricks.labs.automl.params.{
Defaults,
EvolutionDefaults,
KSampleConfig,
RandomForestConfig
}
import com.databricks.labs.automl.utils.{
DataValidation,
SeedConverters,
SparkSessionWrapper
}
import org.apache.spark.ml.evaluation.{
BinaryClassificationEvaluator,
MulticlassClassificationEvaluator,
RegressionEvaluator
}
import org.apache.spark.sql.DataFrame
import scala.collection.mutable.ArrayBuffer
import scala.reflect.runtime.universe._
trait Evolution
extends DataValidation
with EvolutionDefaults
with SeedConverters
with SparkSessionWrapper
with Defaults {
// Mutable configuration state for the trait. Every field starts from a `_default*` member that is
// defined outside this view (presumably in one of the mixed-in defaults traits - TODO confirm) and
// is overwritten by the matching `set*` method further down.

// --- Column names and train/test split settings ---
var _labelCol: String = _defaultLabel
var _featureCol: String = _defaultFeature
var _trainPortion: Double = _defaultTrainPortion
var _trainSplitMethod: String = _defaultTrainSplitMethod
var _kSampleConfig: KSampleConfig = _defaultKSampleConfig
var _trainSplitChronologicalColumn: String =
_defaultTrainSplitChronologicalColumn
var _trainSplitChronologicalRandomPercentage: Double =
_defaultTrainSplitChronologicalRandomPercentage
// --- Execution settings ---
var _parallelism: Int = _defaultParallelism
var _kFold: Int = _defaultKFold
var _seed: Long = _defaultSeed
// Parallel range 0 until _kFold, built from the value _kFold holds at initialisation time;
// setKFold rebuilds it whenever the fold count changes.
var _kFoldIteratorRange: scala.collection.parallel.immutable.ParRange =
Range(0, _kFold).par
// Type is inferred from the default; setFieldsToIgnore / getFieldsToIgnore treat it as Array[String].
var _fieldsToIgnore = _defaultFieldsToIgnoreInVector
// --- Genetic algorithm settings ---
var _optimizationStrategy: String = _defaultOptimizationStrategy
var _firstGenerationGenePool: Int = _defaultFirstGenerationGenePool
var _numberOfMutationGenerations: Int = _defaultNumberOfMutationGenerations
var _numberOfParentsToRetain: Int = _defaultNumberOfParentsToRetain
var _numberOfMutationsPerGeneration: Int =
_defaultNumberOfMutationsPerGeneration
var _geneticMixing: Double = _defaultGeneticMixing
var _generationalMutationStrategy: String =
_defaultGenerationalMutationStrategy
var _mutationMagnitudeMode: String = _defaultMutationMagnitudeMode
var _fixedMutationValue: Int = _defaultFixedMutationValue
var _earlyStoppingScore: Double = _defaultEarlyStoppingScore
var _earlyStoppingFlag: Boolean = _defaultEarlyStoppingFlag
var _evolutionStrategy: String = _defaultEvolutionStrategy
var _geneticMBOCandidateFactor: Int = _defaultGeneticMBOCandidateFactor
var _geneticMBORegressorType: String = _defaultGeneticMBORegressorType
// --- Continuous evolution settings ---
var _continuousEvolutionImprovementThreshold: Int =
_defaultContinuousEvolutionImprovementThreshold
var _continuousEvolutionMaxIterations: Int =
_defaultContinuousEvolutionMaxIterations
var _continuousEvolutionStoppingScore: Double =
_defaultContinuousEvolutionStoppingScore
var _continuousEvolutionParallelism: Int =
_defaultContinuousEvolutionParallelism
var _continuousEvolutionMutationAggressiveness: Int =
_defaultContinuousEvolutionMutationAggressiveness
var _continuousEvolutionGeneticMixing: Double =
_defaultContinuousEvolutionGeneticMixing
var _continuousEvolutionRollingImprovementCount: Int =
_defaultContinuousEvolutionRollingImprovementCount
// --- Initial generation settings ---
var _initialGenerationMode: String = _defaultFirstGenMode
var _initialGenerationPermutationCount: Int = _defaultFirstGenPermutations
var _initialGenerationIndexMixingMode: String =
_defaultFirstGenIndexMixingMode
var _initialGenerationArraySeed: Long = _defaultFirstGenArraySeed
var _hyperSpaceModelCount: Int = _defaultHyperSpaceModelCount
// Stays false until setModelSeed is called, which also stores the supplied map in _modelSeed.
var _modelSeedSet: Boolean = false
var _modelSeed: Map[String, Any] = Map.empty
var _dataReduce: Double = _defaultDataReduce
// --- KSample settings ---
// Each field below is copied from the corresponding member of _defaultKSampleConfig. The individual
// setters in this view update these fields only; they do not touch _kSampleConfig itself.
var _syntheticCol: String = _defaultKSampleConfig.syntheticCol
var _kGroups: Int = _defaultKSampleConfig.kGroups
var _kMeansMaxIter: Int = _defaultKSampleConfig.kMeansMaxIter
var _kMeansTolerance: Double = _defaultKSampleConfig.kMeansTolerance
var _kMeansDistanceMeasurement: String =
_defaultKSampleConfig.kMeansDistanceMeasurement
var _kMeansSeed: Long = _defaultKSampleConfig.kMeansSeed
var _kMeansPredictionCol: String = _defaultKSampleConfig.kMeansPredictionCol
var _lshHashTables: Int = _defaultKSampleConfig.lshHashTables
var _lshSeed: Long = _defaultKSampleConfig.lshSeed
var _lshOutputCol: String = _defaultKSampleConfig.lshOutputCol
var _quorumCount: Int = _defaultKSampleConfig.quorumCount
var _minimumVectorCountToMutate: Int =
_defaultKSampleConfig.minimumVectorCountToMutate
var _vectorMutationMethod: String = _defaultKSampleConfig.vectorMutationMethod
var _mutationMode: String = _defaultKSampleConfig.mutationMode
var _mutationValue: Double = _defaultKSampleConfig.mutationValue
var _labelBalanceMode: String = _defaultKSampleConfig.labelBalanceMode
var _cardinalityThreshold: Int = _defaultKSampleConfig.cardinalityThreshold
var _numericRatio: Double = _defaultKSampleConfig.numericRatio
var _numericTarget: Int = _defaultKSampleConfig.numericTarget
// `scala.util.Random` without `new` is the library's singleton generator object, not a fresh
// instance, so the setSeed call below reseeds that shared object with the value _seed holds at
// initialisation time.
// NOTE(review): setSeed(value) in this view updates _seed only and does not reseed _randomizer -
// confirm whether that is intended.
var _randomizer: scala.util.Random = scala.util.Random
_randomizer.setSeed(_seed)
/**
 * Setter for the label column name (stored in `_labelCol`). No validation is performed.
 * @param value String: the column name to store
 * @return this
 */
def setLabelCol(value: String): this.type = {
_labelCol = value
this
}
/**
 * Setter for the feature column name (stored in `_featureCol`). No validation is performed.
 * @param value String: the column name to store
 * @return this
 */
def setFeaturesCol(value: String): this.type = {
_featureCol = value
this
}
/**
 * Setter for the fields-to-ignore array (stored in `_fieldsToIgnore`). No validation is performed.
 * @param value Array[String]: the field names to store
 * @return this
 */
def setFieldsToIgnore(value: Array[String]): this.type = {
_fieldsToIgnore = value
this
}
/**
 * Setter for the training portion (stored in `_trainPortion`).
 * @param value Double: must lie strictly between 0 and 1
 * @return this
 * @throws IllegalArgumentException if the value is outside of the open interval (0, 1)
 */
def setTrainPortion(value: Double): this.type = {
  val insideOpenUnitInterval = value > 0.0 && value < 1.0
  require(
    insideOpenUnitInterval,
    "Training portion must be in the range > 0 and < 1"
  )
  _trainPortion = value
  this
}
/**
 * Setter for the train split method (stored in `_trainSplitMethod`).
 * @param value String: must be a member of `allowableTrainSplitMethod`
 * @return this
 * @throws IllegalArgumentException if the value is not an allowable split method
 */
def setTrainSplitMethod(value: String): this.type = {
  val isAllowed = allowableTrainSplitMethod.contains(value)
  require(
    isAllowed,
    s"TrainSplitMethod $value must be one of: ${allowableTrainSplitMethod.mkString(", ")}"
  )
  _trainSplitMethod = value
  this
}
/**
 * Setter - for setting the name of the Synthetic column name
 * @param value String: A column name that is uniquely not part of the main DataFrame
 * @return this
 * @since 0.5.1
 * @author Ben Wilson
 */
def setSyntheticCol(value: String): this.type = {
_syntheticCol = value
this
}
/**
 * Setter for specifying the number of K-Groups to generate in the KMeans model.
 * No range check is applied to the value here.
 * @param value Int: number of k groups to generate
 * @return this
 */
def setKGroups(value: Int): this.type = {
_kGroups = value
this
}
/**
 * Setter for specifying the maximum number of iterations for the KMeans model to go through to converge.
 * No range check is applied to the value here.
 * @param value Int: Maximum limit on iterations
 * @return this
 */
def setKMeansMaxIter(value: Int): this.type = {
_kMeansMaxIter = value
this
}
/**
 * Setter for Setting the tolerance for KMeans (must be > 0)
 * @param value The tolerance value setting for KMeans
 * @see reference: [[http://spark.apache.org/docs/latest/api/scala/index.html#org.apache.spark.ml.clustering.KMeans]]
 * for further details.
 * @return this
 * @throws IllegalArgumentException() if a value less than or equal to 0 is entered
 */
@throws(classOf[IllegalArgumentException])
def setKMeansTolerance(value: Double): this.type = {
require(
value > 0,
s"KMeans tolerance value ${value.toString} is out of range. Must be > 0."
)
_kMeansTolerance = value
this
}
/**
 * Setter for the distance measurement used to calculate the nearness of vectors to a centroid.
 * @param value String: must be a member of `allowableKMeansDistanceMeasurements`
 *              (the original documentation lists "euclidean" and "cosine", default "euclidean")
 * @return this
 * @throws IllegalArgumentException() if an invalid value is entered
 */
@throws(classOf[IllegalArgumentException])
def setKMeansDistanceMeasurement(value: String): this.type = {
  val isSupported = allowableKMeansDistanceMeasurements.contains(value)
  require(
    isSupported,
    s"Kmeans Distance Measurement $value is not " +
      s"a valid mode of operation. Must be one of: ${allowableKMeansDistanceMeasurements.mkString(", ")}"
  )
  _kMeansDistanceMeasurement = value
  this
}
/**
 * Setter for a KMeans seed for the clustering algorithm
 * @param value Long: Seed value (stored as-is)
 * @return this
 */
def setKMeansSeed(value: Long): this.type = {
_kMeansSeed = value
this
}
/**
 * Setter for the internal KMeans column for cluster membership attribution
 * @param value String: column name for internal algorithm column for group membership (stored as-is)
 * @return this
 */
def setKMeansPredictionCol(value: String): this.type = {
_kMeansPredictionCol = value
this
}
/**
 * Setter for Configuring the number of Hash Tables to use for MinHashLSH.
 * No range check is applied to the value here.
 * @param value Int: Count of hash tables to use
 * @see [[http://spark.apache.org/docs/latest/api/scala/index.html#org.apache.spark.ml.feature.MinHashLSH]]
 * for more information
 * @return this
 */
def setLSHHashTables(value: Int): this.type = {
_lshHashTables = value
this
}
/**
 * Setter for the LSH Seed for the model
 * @param value Long: Seed value (stored as-is)
 * @return this
 */
def setLSHSeed(value: Long): this.type = {
_lshSeed = value
this
}
/**
 * Setter for the internal LSH output hash information column
 * @param value String: column name for the internal MinHashLSH Model transformation value (stored as-is)
 * @return this
 */
def setLSHOutputCol(value: String): this.type = {
_lshOutputCol = value
this
}
/**
 * Setter for how many vectors to find in adjacency to the centroid for generation of synthetic data.
 * No range check is applied to the value here.
 * @note the higher the value set here, the higher the variance in synthetic data generation
 * @param value Int: Number of vectors to find nearest each centroid within the class
 * @return this
 */
def setQuorumCount(value: Int): this.type = {
_quorumCount = value
this
}
/**
 * Setter for minimum threshold for vector indexes to mutate within the feature vector.
 * No range check is applied to the value here.
 * @note In vectorMutationMethod "fixed" this sets the fixed count of how many vector positions to mutate.
 * In vectorMutationMethod "random" this sets the lower threshold for 'at least this many indexes will
 * be mutated'
 * @param value The minimum (or fixed) number of indexes to mutate.
 * @return this
 */
def setMinimumVectorCountToMutate(value: Int): this.type = {
_minimumVectorCountToMutate = value
this
}
/**
 * Setter for the Vector Mutation Method.
 * @note Options, as described by the original documentation:
 *       "fixed" - uses minimumVectorCountToMutate as the exact number of random indexes to select.
 *       "random" - uses minimumVectorCountToMutate as a lower bound on a random selection of indexes.
 *       "all" - mutates all of the vectors.
 * @param value String: must be a member of `allowableVectorMutationMethods`
 * @return this
 * @throws IllegalArgumentException() if the mode is not supported.
 */
@throws(classOf[IllegalArgumentException])
def setVectorMutationMethod(value: String): this.type = {
  val isSupported = allowableVectorMutationMethods.contains(value)
  require(
    isSupported,
    s"Vector Mutation Mode $value is not supported. " +
      s"Must be one of: ${allowableVectorMutationMethods.mkString(", ")} "
  )
  _vectorMutationMethod = value
  this
}
/**
 * Setter for the Mutation Mode of the feature vector individual values.
 * @note Options, as described by the original documentation:
 *       "weighted" - weighted averaging to scale the euclidean distance between the centroid vector
 *       and mutation candidate vectors
 *       "random" - random position on the euclidean vector between the centroid vector and the
 *       candidate mutation vectors
 *       "ratio" - a ratio between the values of the centroid vector and the mutation vector
 * @param value String: must be a member of `allowableMutationModes`
 * @return this
 * @throws IllegalArgumentException() if the mode is not supported.
 */
@throws(classOf[IllegalArgumentException])
def setMutationMode(value: String): this.type = {
  val isSupported = allowableMutationModes.contains(value)
  require(
    isSupported,
    s"Mutation Mode $value is not a valid mode of operation. " +
      s"Must be one of: ${allowableMutationModes.mkString(", ")}"
  )
  _mutationMode = value
  this
}
/**
 * Setter for the mutation magnitude used by the modes 'weighted' and 'ratio' in mutationMode.
 * @param value Double: strictly between 0 and 1.
 * @note per the original documentation, the higher this value, the closer to the centroid vector
 *       vs. the candidate mutation vector the synthetic row data will be.
 * @return this
 * @throws IllegalArgumentException() if the value specified is outside of the range (0, 1)
 */
@throws(classOf[IllegalArgumentException])
def setMutationValue(value: Double): this.type = {
  val insideOpenUnitInterval = value > 0 && value < 1
  require(
    insideOpenUnitInterval,
    s"Mutation Value must be between 0 and 1. Value $value is not permitted."
  )
  _mutationValue = value
  this
}
/**
 * Setter - for determining the label balance approach mode.
 * @note Available modes:
 *       'match': Will match all smaller class counts to largest class count. [WARNING] - May significantly increase memory pressure!
 *       'percentage' Will adjust smaller classes to a percentage value of the largest class count.
 *       'target' Will increase smaller class counts to a fixed numeric target of rows.
 * @param value String: one of: 'match', 'percentage' or 'target'
 *              (validated against `allowableLabelBalanceModes`)
 * @note Default: "percentage"
 * @return this
 * @since 0.5.1
 * @author Ben Wilson
 * @throws IllegalArgumentException() if the provided mode is not supported (raised by `require`).
 */
@throws(classOf[IllegalArgumentException])
def setLabelBalanceMode(value: String): this.type = {
  require(
    allowableLabelBalanceModes.contains(value),
    // Trailing space keeps the two sentences apart once the fragments are concatenated.
    s"Label Balance Mode $value is not supported. " +
      s"Must be one of: ${allowableLabelBalanceModes.mkString(", ")}"
  )
  _labelBalanceMode = value
  this
}
/**
 * Setter - for overriding the cardinality threshold exception threshold. [WARNING] increasing this value on
 * a sufficiently large data set could incur, during runtime, excessive memory and cpu pressure on the cluster.
 * No range check is applied to the value here.
 * @param value Int: the limit above which an exception will be thrown for a classification problem wherein the
 * label distinct count is too large to successfully generate synthetic data.
 * @return this
 * @note Default: 20
 * @since 0.5.1
 * @author Ben Wilson
 */
def setCardinalityThreshold(value: Int): this.type = {
_cardinalityThreshold = value
this
}
/**
 * Setter - for specifying the percentage ratio for the mode 'percentage' in setLabelBalanceMode()
 * @param value Double: A fractional double greater than 0.0 and less than or equal to 1.0.
 * @note Setting this value to 1.0 is equivalent to setting the label balance mode to 'match'
 * @note Default: 0.2
 * @return this
 * @since 0.5.1
 * @author Ben Wilson
 * @throws IllegalArgumentException() if the provided value is outside of the range (0.0, 1.0]
 *                                    (raised by `require`).
 */
@throws(classOf[IllegalArgumentException])
def setNumericRatio(value: Double): this.type = {
  require(
    value > 0.0 && value <= 1.0,
    // Trailing space keeps the two sentences apart once the fragments are concatenated.
    s"Invalid Numeric Ratio entered! Must be between 0 and 1. " +
      s"${value.toString} is not valid."
  )
  _numericRatio = value
  this
}
/**
 * Setter - for specifying the target row count to generate for 'target' mode in setLabelBalanceMode().
 * No range check is applied to the value here.
 * @param value Int: The desired final number of rows per minority class label
 * @return this
 * @note [WARNING] Setting this value to too high of a number will greatly increase runtime and memory pressure.
 * @since 0.5.1
 * @author Ben Wilson
 */
def setNumericTarget(value: Int): this.type = {
_numericTarget = value
this
}
/**
 * Setter for the chronological split column name (stored in `_trainSplitChronologicalColumn`).
 * No validation is performed.
 * @param value String: the column name to store
 * @return this
 */
def setTrainSplitChronologicalColumn(value: String): this.type = {
_trainSplitChronologicalColumn = value
this
}
/**
 * Setter for the chronological split random percentage
 * (stored in `_trainSplitChronologicalRandomPercentage`).
 * The value is always stored; values above 10 additionally print a warning to stdout.
 * @param value Double: the percentage to store
 * @return this
 */
def setTrainSplitChronologicalRandomPercentage(value: Double): this.type = {
  _trainSplitChronologicalRandomPercentage = value
  val exceedsAdvisedLimit = value > 10
  if (exceedsAdvisedLimit) {
    println(
      "[WARNING] setTrainSplitChronologicalRandomPercentage() setting this value above 10 " +
        "percent will cause significant per-run train/test skew and variability in row counts during training. " +
        "Use higher values only if this is desired."
    )
  }
  this
}
/**
 * Setter for the parallelism level (stored in `_parallelism`).
 * @param value Int: the requested parallelism; must be below 10000
 * @return this
 * @throws IllegalArgumentException if the requested value is 10000 or greater
 */
def setParallelism(value: Int): this.type = {
  // Validate the incoming value. Checking the currently stored field would let any new value
  // through as long as the previous setting was acceptable.
  require(
    value < 10000,
    s"Parallelism above 10000 will result in cluster instability."
  )
  _parallelism = value
  this
}
/**
 * Setter for the k-fold count. Stores the value in `_kFold` and rebuilds the parallel
 * iterator range `_kFoldIteratorRange` as 0 until the new fold count.
 * @param value Int: the number of folds
 * @return this
 */
def setKFold(value: Int): this.type = {
  _kFold = value
  _kFoldIteratorRange = (0 until _kFold).par
  this
}
/**
 * Setter for the seed value (stored in `_seed`).
 * NOTE(review): only `_seed` is updated here; the `_randomizer` that was seeded with `_seed`
 * during initialisation is not reseeded by this call in the visible code - confirm whether that
 * is intended.
 * @param value Long: the seed to store
 * @return this
 */
def setSeed(value: Long): this.type = {
_seed = value
this
}
/**
 * Setter for the optimization strategy. The value is lower-cased before it is validated
 * against `allowableOptimizationStrategies` and stored.
 * @param value String: the strategy name (case-insensitive)
 * @return this
 * @throws IllegalArgumentException if the lower-cased value is not an allowable strategy
 */
def setOptimizationStrategy(value: String): this.type = {
  val normalized = value.toLowerCase
  val isSupported = allowableOptimizationStrategies.contains(normalized)
  require(
    isSupported,
    s"Optimization Strategy '$normalized' is not a member of ${invalidateSelection(normalized, allowableOptimizationStrategies)}"
  )
  _optimizationStrategy = normalized
  this
}
/**
 * Setter for the first generation gene pool size (stored in `_firstGenerationGenePool`).
 * @param value Int: must be 5 or greater
 * @return this
 * @throws IllegalArgumentException if the value is less than 5
 */
def setFirstGenerationGenePool(value: Int): this.type = {
require(
value >= 5,
s"Values less than 5 for firstGenerationGenePool will require excessive generational mutation to converge"
)
_firstGenerationGenePool = value
this
}
/**
 * Setter for the number of mutation generations (stored in `_numberOfMutationGenerations`).
 * @param value Int: must be greater than 0
 * @return this
 * @throws IllegalArgumentException if the value is not greater than 0
 */
def setNumberOfMutationGenerations(value: Int): this.type = {
require(value > 0, s"Number of Generations must be greater than 0")
_numberOfMutationGenerations = value
this
}
/**
 * Setter for the number of parents to retain (stored in `_numberOfParentsToRetain`).
 * @param value Int: must be greater than 0
 * @return this
 * @throws IllegalArgumentException if the value is not greater than 0
 */
def setNumberOfParentsToRetain(value: Int): this.type = {
require(
value > 0,
s"Number of Parents must be greater than 0. '$value' is not a valid number."
)
_numberOfParentsToRetain = value
this
}
/**
 * Setter for the number of mutations per generation (stored in `_numberOfMutationsPerGeneration`).
 * @param value Int: must be greater than 0
 * @return this
 * @throws IllegalArgumentException if the value is not greater than 0
 */
def setNumberOfMutationsPerGeneration(value: Int): this.type = {
require(
value > 0,
s"Number of Mutations per generation must be greater than 0. '$value' is not a valid number."
)
_numberOfMutationsPerGeneration = value
this
}
/**
 * Setter for the genetic mixing factor (stored in `_geneticMixing`).
 * @param value Double: must lie strictly between 0 and 1
 * @return this
 * @throws IllegalArgumentException if the value is outside of the open interval (0, 1)
 */
def setGeneticMixing(value: Double): this.type = {
  val insideOpenUnitInterval = value > 0.0 && value < 1.0
  require(
    insideOpenUnitInterval,
    s"Mutation Aggressiveness must be in range (0,1). Current Setting of $value is not permitted."
  )
  _geneticMixing = value
  this
}
/**
 * Setter for the generational mutation strategy. The value is lower-cased before it is
 * validated against `allowableMutationStrategies` and stored.
 * @param value String: the strategy name (case-insensitive)
 * @return this
 * @throws IllegalArgumentException if the lower-cased value is not an allowable strategy
 */
def setGenerationalMutationStrategy(value: String): this.type = {
  val normalized = value.toLowerCase
  val isSupported = allowableMutationStrategies.contains(normalized)
  require(
    isSupported,
    s"Generational Mutation Strategy '$normalized' is not a member of ${invalidateSelection(normalized, allowableMutationStrategies)}"
  )
  _generationalMutationStrategy = normalized
  this
}
/**
 * Setter for the mutation magnitude mode. The value is lower-cased before it is validated
 * against `allowableMutationMagnitudeMode` and stored.
 * @param value String: the mode name (case-insensitive)
 * @return this
 * @throws IllegalArgumentException if the lower-cased value is not an allowable mode
 */
def setMutationMagnitudeMode(value: String): this.type = {
  val normalized = value.toLowerCase
  val isSupported = allowableMutationMagnitudeMode.contains(normalized)
  require(
    isSupported,
    s"Mutation Magnitude Mode '$normalized' is not a member of ${invalidateSelection(normalized, allowableMutationMagnitudeMode)}"
  )
  _mutationMagnitudeMode = normalized
  this
}
/**
 * Setter for the fixed mutation count (stored in `_fixedMutationValue`).
 * The upper bound is the number of case accessors on `RandomForestConfig`, as counted by
 * `modelConfigLength`. The upper bound is checked before the lower bound.
 * @param value Int: must be greater than 0 and no larger than that count
 * @return this
 * @throws IllegalArgumentException if the value is out of range
 */
def setFixedMutationValue(value: Int): this.type = {
  val hyperParameterCount = modelConfigLength[RandomForestConfig]
  val withinUpperBound = value <= hyperParameterCount
  require(
    withinUpperBound,
    s"Mutation count '$value' cannot exceed number of hyperparameters ($hyperParameterCount)"
  )
  val isPositive = value > 0
  require(isPositive, s"Mutation count '$value' must be greater than 0")
  _fixedMutationValue = value
  this
}
/**
 * Setter for the early stopping score (stored in `_earlyStoppingScore`). No validation is performed.
 * @param value Double: the score to store
 * @return this
 */
def setEarlyStoppingScore(value: Double): this.type = {
_earlyStoppingScore = value
this
}
/**
 * Setter for the early stopping flag (stored in `_earlyStoppingFlag`).
 * @param value Boolean: the flag to store
 * @return this
 */
def setEarlyStoppingFlag(value: Boolean): this.type = {
_earlyStoppingFlag = value
this
}
/**
 * Setter for the evolution strategy (stored in `_evolutionStrategy`).
 * The value is compared as-is (no case normalisation) against `allowableEvolutionStrategies`.
 * @param value String: the strategy name
 * @return this
 * @throws IllegalArgumentException if the value is not an allowable strategy
 */
def setEvolutionStrategy(value: String): this.type = {
  val isSupported = allowableEvolutionStrategies.contains(value)
  require(
    isSupported,
    s"Evolution Strategy '$value' is not a supported mode. Must be one of: ${invalidateSelection(value, allowableEvolutionStrategies)}"
  )
  _evolutionStrategy = value
  this
}
/**
 * Setter for defining the secondary stopping criteria for continuous training mode (the number of consistently
 * not-improving runs after which the learning algorithm terminates due to diminishing returns).
 * @param value Negative Integer (an improvement to a priori will reset the counter and subsequent non-improvements
 * will decrement a mutable counter. If the counter hits this limit specified in value, the continuous
 * mode algorithm will stop).
 * @return this
 * @author Ben Wilson, Databricks
 * @since 0.6.0
 * @throws IllegalArgumentException if the value is zero or positive.
 */
@throws(classOf[IllegalArgumentException])
def setContinuousEvolutionImprovementThreshold(value: Int): this.type = {
require(
value < 0,
s"ContinuousEvolutionImprovementThreshold must be less than zero. It is " +
s"recommended to set this value to less than -4."
)
_continuousEvolutionImprovementThreshold = value
this
}
/**
 * Setter for selecting the type of Regressor to use for the within-epoch generation MBO of candidates.
 * @param value String: must be a member of `allowableMBORegressorTypes`
 *              (the original documentation lists "XGBoost", "LinearRegression" and "RandomForest")
 * @return this
 * @author Ben Wilson, Databricks
 * @since 0.6.0
 * @throws IllegalArgumentException if the value is not supported
 */
@throws(classOf[IllegalArgumentException])
def setGeneticMBORegressorType(value: String): this.type = {
  val isSupported = allowableMBORegressorTypes.contains(value)
  require(
    isSupported,
    s"GeneticRegressorType $value is not a supported Regressor " +
      s"Type. Must be one of: ${allowableMBORegressorTypes.mkString(", ")}"
  )
  _geneticMBORegressorType = value
  this
}
/**
 * Setter for defining the factor to be applied to the candidate listing of hyperparameters to generate through
 * mutation for each generation other than the initial and post-modeling optimization phases. The larger this
 * value (default: 10), the more potential space can be searched. There is not a large performance hit to this,
 * and as such, values in excess of 100 are viable.
 * @param value Int - a factor to multiply the numberOfMutationsPerGeneration by to generate a count of potential
 * candidates.
 * @return this
 * @author Ben Wilson, Databricks
 * @since 0.6.0
 * @throws IllegalArgumentException if the value is not greater than zero.
 */
@throws(classOf[IllegalArgumentException])
def setGeneticMBOCandidateFactor(value: Int): this.type = {
require(value > 0, s"GeneticMBOCandidateFactor must be greater than zero.")
_geneticMBOCandidateFactor = value
this
}
/**
 * Setter for the continuous evolution iteration cap (stored in `_continuousEvolutionMaxIterations`).
 * The value is always stored; values above 500 additionally print a warning to stdout.
 * @param value Int: the iteration cap to store
 * @return this
 */
def setContinuousEvolutionMaxIterations(value: Int): this.type = {
  val exceedsAdvisedLimit = value > 500
  if (exceedsAdvisedLimit) {
    println(
      s"[WARNING] Total Modeling count $value is higher than recommended limit of 500. " +
        s"This tuning will take a long time to run."
    )
  }
  _continuousEvolutionMaxIterations = value
  this
}
/**
 * Setter for the continuous evolution stopping score (stored in `_continuousEvolutionStoppingScore`).
 * No validation is performed.
 * @param value Double: the score to store
 * @return this
 */
def setContinuousEvolutionStoppingScore(value: Double): this.type = {
_continuousEvolutionStoppingScore = value
this
}
/**
 * Setter for the continuous evolution parallelism (stored in `_continuousEvolutionParallelism`).
 * The value is always stored; values above 10 additionally print a warning to stdout.
 * @param value Int: the parallelism to store
 * @return this
 */
def setContinuousEvolutionParallelism(value: Int): this.type = {
  val exceedsAdvisedLimit = value > 10
  if (exceedsAdvisedLimit) {
    println(
      s"[WARNING] ContinuousEvolutionParallelism -> $value is higher than recommended " +
        s"concurrency for efficient optimization for convergence." +
        s"\n Setting this value below 11 will converge faster in most cases."
    )
  }
  _continuousEvolutionParallelism = value
  this
}
/**
 * Setter for the continuous evolution mutation aggressiveness
 * (stored in `_continuousEvolutionMutationAggressiveness`).
 * The value is always stored; values above 4 additionally print a warning to stdout.
 * @param value Int: the aggressiveness to store
 * @return this
 */
def setContinuousEvolutionMutationAggressiveness(value: Int): this.type = {
  val exceedsAdvisedLimit = value > 4
  if (exceedsAdvisedLimit) {
    println(
      s"[WARNING] ContinuousEvolutionMutationAggressiveness -> $value. " +
        s"\n Setting this higher than 4 will result in extensive random search and will take longer to converge " +
        s"to optimal hyperparameters."
    )
  }
  _continuousEvolutionMutationAggressiveness = value
  this
}
/**
 * Setter for the continuous evolution genetic mixing factor
 * (stored in `_continuousEvolutionGeneticMixing`).
 * @param value Double: must lie strictly between 0 and 1
 * @return this
 * @throws IllegalArgumentException if the value is outside of the open interval (0, 1)
 */
def setContinuousEvolutionGeneticMixing(value: Double): this.type = {
  val insideOpenUnitInterval = value > 0.0 && value < 1.0
  require(
    insideOpenUnitInterval,
    s"Mutation Aggressiveness must be in range (0,1). Current Setting of $value is not permitted."
  )
  _continuousEvolutionGeneticMixing = value
  this
}
/**
 * Setter for the continuous evolution rolling improvement count
 * (stored in `_continuousEvolutionRollingImprovementCount`).
 * Note that the method name carries the historical spelling "Imporvement"; it is part of the
 * public interface and is therefore left unchanged.
 * @param value Int: must be greater than 0; values below 10 are accepted but print a warning to stdout
 * @return this
 * @throws IllegalArgumentException if the value is not greater than 0
 */
def setContinuousEvolutionRollingImporvementCount(value: Int): this.type = {
require(
value > 0,
s"ContinuousEvolutionRollingImprovementCount must be > 0. $value is invalid."
)
// Advisory only: the value is still stored below.
if (value < 10)
println(
s"[WARNING] ContinuousEvolutionRollingImprovementCount -> $value setting is low. " +
s"Optimal Convergence may not occur due to early stopping."
)
_continuousEvolutionRollingImprovementCount = value
this
}
/**
 * Setter for the model seed map. Stores the map in `_modelSeed` and flips `_modelSeedSet` to true.
 * @param value Map[String, Any]: the seed configuration to store; contents are not validated here
 * @return this
 */
def setModelSeed(value: Map[String, Any]): this.type = {
_modelSeed = value
_modelSeedSet = true
this
}
/**
 * Setter for the data reduction factor (stored in `_dataReduce`).
 * @param value Double: must lie strictly between 0 and 1
 * @return this
 * @throws IllegalArgumentException if the value is outside of the open interval (0, 1)
 */
def setDataReductionFactor(value: Double): this.type = {
  val insideOpenUnitInterval = value > 0 && value < 1
  require(
    insideOpenUnitInterval,
    s"Data Reduction Factor must be between 0 and 1"
  )
  _dataReduce = value
  this
}
/**
 * Setter for the initial generation mode (stored in `_initialGenerationMode`).
 * @param value String: must be a member of `allowableInitialGenerationModes`
 * @return this
 * @throws IllegalArgumentException if the value is not an allowable mode
 */
def setFirstGenMode(value: String): this.type = {
  val isSupported = allowableInitialGenerationModes.contains(value)
  require(
    isSupported,
    s"First Generation Mode '$value' is not a supported mode." +
      s" Must be one of: ${invalidateSelection(value, allowableInitialGenerationModes)}"
  )
  _initialGenerationMode = value
  this
}
/**
 * Setter for the initial generation permutation count (stored in `_initialGenerationPermutationCount`).
 * No validation is performed.
 * @param value Int: the permutation count to store
 * @return this
 */
def setFirstGenPermutations(value: Int): this.type = {
_initialGenerationPermutationCount = value
this
}
/**
 * Setter for the hyper space model count (stored in `_hyperSpaceModelCount`).
 * No validation is performed.
 * @param value Int: the model count to store
 * @return this
 */
def setHyperSpaceModelCount(value: Int): this.type = {
_hyperSpaceModelCount = value
this
}
/**
 * Setter for the initial generation index mixing mode (stored in `_initialGenerationIndexMixingMode`).
 * @param value String: must be a member of `allowableInitialGenerationIndexMixingModes`
 * @return this
 * @throws IllegalArgumentException if the value is not an allowable index mixing mode
 */
def setFirstGenIndexMixingMode(value: String): this.type = {
  require(
    allowableInitialGenerationIndexMixingModes.contains(value),
    // The message names the setting that was actually rejected, and the trailing space keeps
    // "a" and "supported" apart once the fragments are concatenated.
    s"First Generation Index Mixing Mode '$value' is not a " +
      s"supported mode. Must be one of ${invalidateSelection(value, allowableInitialGenerationIndexMixingModes)}"
  )
  _initialGenerationIndexMixingMode = value
  this
}
/**
 * Setter for the initial generation array seed (stored in `_initialGenerationArraySeed`).
 * No validation is performed.
 * @param value Long: the seed to store
 * @return this
 */
def setFirstGenArraySeed(value: Long): this.type = {
_initialGenerationArraySeed = value
this
}
// ---------------------------------------------------------------------------------------------
// Getters. Each one returns the current value of the backing field without any transformation.
// ---------------------------------------------------------------------------------------------
// Initial generation settings
def getFirstGenArraySeed: Long = _initialGenerationArraySeed
def getFirstGenIndexMixingMode: String = _initialGenerationIndexMixingMode
def getFirstGenPermutations: Int = _initialGenerationPermutationCount
def getFirstGenMode: String = _initialGenerationMode
def getHyperSpaceModelCount: Int = _hyperSpaceModelCount
// Column names and train/test split settings
def getLabelCol: String = _labelCol
def getFeaturesCol: String = _featureCol
def getFieldsToIgnore: Array[String] = _fieldsToIgnore
def getTrainPortion: Double = _trainPortion
def getTrainSplitMethod: String = _trainSplitMethod
def getTrainSplitChronologicalColumn: String = _trainSplitChronologicalColumn
def getTrainSplitChronologicalRandomPercentage: Double =
_trainSplitChronologicalRandomPercentage
// Execution settings
def getParallelism: Int = _parallelism
def getKFold: Int = _kFold
def getSeed: Long = _seed
// Genetic algorithm settings
def getOptimizationStrategy: String = _optimizationStrategy
def getFirstGenerationGenePool: Int = _firstGenerationGenePool
def getNumberOfMutationGenerations: Int = _numberOfMutationGenerations
def getNumberOfParentsToRetain: Int = _numberOfParentsToRetain
def getNumberOfMutationsPerGeneration: Int = _numberOfMutationsPerGeneration
def getGeneticMixing: Double = _geneticMixing
def getGenerationalMutationStrategy: String = _generationalMutationStrategy
def getMutationMagnitudeMode: String = _mutationMagnitudeMode
def getFixedMutationValue: Int = _fixedMutationValue
def getEarlyStoppingScore: Double = _earlyStoppingScore
def getEarlyStoppingFlag: Boolean = _earlyStoppingFlag
def getEvolutionStrategy: String = _evolutionStrategy
def getGeneticMBORegressorType: String = _geneticMBORegressorType
def getGeneticMBOCandidateFactor: Int = _geneticMBOCandidateFactor
// Continuous evolution settings
def getContinuousEvolutionImprovementThreshold: Int =
_continuousEvolutionImprovementThreshold
def getContinuousEvolutionMaxIterations: Int =
_continuousEvolutionMaxIterations
def getContinuousEvolutionStoppingScore: Double =
_continuousEvolutionStoppingScore
def getContinuousEvolutionParallelism: Int = _continuousEvolutionParallelism
def getContinuousEvolutionMutationAggressiveness: Int =
_continuousEvolutionMutationAggressiveness
def getContinuousEvolutionGeneticMixing: Double =
_continuousEvolutionGeneticMixing
// Name keeps the historical spelling "Imporvement" to match the corresponding setter.
def getContinuousEvolutionRollingImporvementCount: Int =
_continuousEvolutionRollingImprovementCount
// Model seed and data reduction
def getModelSeed: Map[String, Any] = _modelSeed
def getDataReductionFactor: Double = _dataReduce
/**
 * Builds a multi-line debug string that reports the current parallelism alongside the values
 * exposed by `PerformanceSettings` (xgbWorkers, totalCores, optimalJVMModelPartitions, parTasks).
 * Intended for logging of configurations.
 * @return the formatted debug report
 */
def debugSettings: String = {
  // Entries are evaluated in the order listed, one report line per entry.
  val reportLines = Seq(
    s"Evolution.scala --> xgbWorkers: ${PerformanceSettings.xgbWorkers(_parallelism)}",
    s"Evolution.scala --> totalCores: ${PerformanceSettings.totalCores}",
    s"Evolution.scala --> _parallelism: ${_parallelism}",
    s"Evolution.scala --> getParallelism: ${getParallelism}",
    s"Evolution.scala --> optimalJVMModelPartitions: ${PerformanceSettings.optimalJVMModelPartitions(_parallelism)}",
    s"Evolution.scala --> parTasks: ${PerformanceSettings.parTasks}"
  )
  "DEBUG: \n " + reportLines.mkString(" \n ")
}
/**
 * Internal check that a user-supplied numeric mapping only uses keys that exist in the
 * statically defined mapping. The first offending key (in the iteration order of `modConfig`)
 * aborts the validation.
 * @param standardConfig The static defined numeric mapping for a model type
 * @param modConfig a user-specified mapping override
 * @since 0.6.1
 * @author Ben Wilson, Databricks
 * @throws IllegalArgumentException if the key is invalid for the model type specified.
 */
@throws(classOf[IllegalArgumentException])
protected[model] def validateNumericMapping(
  standardConfig: Map[String, (Double, Double)],
  modConfig: Map[String, (Double, Double)]
): Unit = {
  val permittedKeys = standardConfig.keys.toArray
  for (candidate <- modConfig.keys if !permittedKeys.contains(candidate)) {
    throw new IllegalArgumentException(
      s"The numeric Boundary map key " +
        s"supplied: [$candidate] is not a valid member of Numeric Mapping. " +
        s"\nKeys are restricted to: [${permittedKeys.mkString(", ")}]"
    )
  }
}
/**
 * Internal check that a user-supplied string mapping only uses keys that exist in the
 * statically defined mapping. The first offending key (in the iteration order of `modConfig`)
 * aborts the validation.
 * @param standardConfig The static defined string mapping for a model type
 * @param modConfig a user-specified mapping override
 * @since 0.6.1
 * @author Ben Wilson, Databricks
 * @throws IllegalArgumentException if the key is invalid for the model type specified.
 */
@throws(classOf[IllegalArgumentException])
protected[model] def validateStringMapping(
  standardConfig: Map[String, List[String]],
  modConfig: Map[String, List[String]]
): Unit = {
  val permittedKeys = standardConfig.keys.toArray
  for (candidate <- modConfig.keys if !permittedKeys.contains(candidate)) {
    throw new IllegalArgumentException(
      s"The string Boundary map key " +
        s"supplied: [$candidate] is not a valid member of String Mapping. " +
        s"\nKeys are restricted to: [${permittedKeys.mkString(", ")}]"
    )
  }
}
/**
 * Helper for partially updating a numeric mapping: every entry of `updateMap` is laid over
 * `defaultMap`, replacing the value for keys that already exist and adding keys that do not.
 * @param defaultMap The default configuration Map for a numeric mapping for model hyperparameter search space
 * @param updateMap user-supplied updated map (doesn't have to have all elements in it)
 * @return The default map, updated with the user-supplied overrides
 * @since 0.6.1
 * @author Ben Wilson, Jas Bali Databricks
 */
def partialOverrideNumericMapping(
  defaultMap: Map[String, (Double, Double)],
  updateMap: Map[String, (Double, Double)]
): Map[String, (Double, Double)] =
  updateMap.foldLeft(defaultMap) { (merged, overrideEntry) =>
    merged + overrideEntry
  }
/**
 * Helper for partially updating a string mapping: every entry of `updateMap` is laid over
 * `defaultMap`, replacing the value for keys that already exist and adding keys that do not.
 *
 * @param defaultMap The default configuration Map for a string mapping for model hyperparameter search space
 * @param updateMap user-supplied updated map (doesn't have to have all elements in it)
 * @return The default map, updated with the user-supplied overrides
 * @since 0.6.1
 * @author Ben Wilson, Jas Bali Databricks
 */
def partialOverrideStringMapping(
  defaultMap: Map[String, List[String]],
  updateMap: Map[String, List[String]]
): Map[String, List[String]] =
  updateMap.foldLeft(defaultMap) { (merged, overrideEntry) =>
    merged + overrideEntry
  }
// TODO - Calculation should take into account early stopping
/**
 * Computes a model count from the current settings, depending on `_evolutionStrategy`:
 *  - "batch": mutations per generation times generations, plus the first generation gene pool,
 *    the initial generation permutation count and the hyper space model count
 *  - "continuous": max iterations minus continuous parallelism, plus the first generation gene pool
 * @return the computed model count
 * @throws MatchError if `_evolutionStrategy` is neither "batch" nor "continuous"
 */
def totalModels: Int = {
  if (_evolutionStrategy == "batch") {
    val mutationModels = _numberOfMutationsPerGeneration * _numberOfMutationGenerations
    mutationModels + _firstGenerationGenePool +
      _initialGenerationPermutationCount + _hyperSpaceModelCount
  } else if (_evolutionStrategy == "continuous") {
    _continuousEvolutionMaxIterations - _continuousEvolutionParallelism + _firstGenerationGenePool
  } else {
    throw new MatchError(
      s"EvolutionStrategy mode ${_evolutionStrategy} is not supported." +
        s"\n Choose one of: ${allowableEvolutionStrategies.mkString(", ")}"
    )
  }
}
/**
 * Counts, via runtime reflection, the members of `T` that are method symbols flagged as
 * case accessors (i.e. the constructor fields of a case class).
 * @tparam T the type to inspect; a `TypeTag` must be available
 * @return the number of case accessor methods found on `T`
 */
def modelConfigLength[T: TypeTag]: Int =
  typeOf[T].members.count {
    case accessor: MethodSymbol => accessor.isCaseAccessor
    case _                      => false
  }
/**
 * Widens a boundary value to Double. Boundaries arrive typed as `AnyVal`, so a map built
 * with Int, Long, Float, Short or Byte literals is accepted without a blind cast failing.
 * @param raw the boundary value taken from a boundary map
 * @return the value as a Double
 * @throws ClassCastException if the value is not a numeric primitive (e.g. Boolean, Char, Unit)
 */
private def boundaryAsDouble(raw: AnyVal): Double = raw match {
  case d: Double => d
  case f: Float  => f.toDouble
  case l: Long   => l.toDouble
  case i: Int    => i.toDouble
  case s: Short  => s.toDouble
  case b: Byte   => b.toDouble
  case other =>
    throw new ClassCastException(
      s"Boundary value '$other' cannot be converted to a Double."
    )
}
/**
 * Looks up the (minimum, maximum) boundary pair for a parameter and returns it as Doubles.
 * @param param the key to look up in the boundary map
 * @param boundaryMap mapping of parameter name to a (minimum, maximum) pair of numeric values
 * @return the (minimum, maximum) pair as Doubles
 * @throws NoSuchElementException if `param` is not a key of `boundaryMap`
 * @throws ClassCastException if a boundary value is not a numeric primitive
 */
def extractBoundaryDouble(
  param: String,
  boundaryMap: Map[String, (AnyVal, AnyVal)]
): (Double, Double) = {
  // Single lookup; the pair is destructured instead of indexing the map twice.
  val (rawMinimum, rawMaximum) = boundaryMap(param)
  (boundaryAsDouble(rawMinimum), boundaryAsDouble(rawMaximum))
}
/**
 * Looks up the (minimum, maximum) boundary pair for a parameter and returns it as Ints.
 * Each boundary is first widened to Double and then truncated with `toInt`.
 * @param param the key to look up in the boundary map
 * @param boundaryMap mapping of parameter name to a (minimum, maximum) pair of numeric values
 * @return the (minimum, maximum) pair truncated to Ints
 * @throws NoSuchElementException if `param` is not a key of `boundaryMap`
 * @throws ClassCastException if a boundary value is not a numeric primitive
 */
def extractBoundaryInteger(
  param: String,
  boundaryMap: Map[String, (AnyVal, AnyVal)]
): (Int, Int) = {
  val (rawMinimum, rawMaximum) = boundaryMap(param)
  (boundaryAsDouble(rawMinimum).toInt, boundaryAsDouble(rawMaximum).toInt)
}
/**
 * Draws a Double between the minimum and maximum boundary configured for `param`, using the
 * shared `_randomizer`: minimum + nextDouble() * (maximum - minimum).
 * @param param the key to look up in the boundary map
 * @param boundaryMap mapping of parameter name to a (minimum, maximum) pair
 * @return the generated value
 */
def generateRandomDouble(
  param: String,
  boundaryMap: Map[String, (AnyVal, AnyVal)]
): Double = {
  val bounds = extractBoundaryDouble(param, boundaryMap)
  val lowerBound = bounds._1
  val upperBound = bounds._2
  lowerBound + _randomizer.nextDouble() * (upperBound - lowerBound)
}
/**
 * Draws an Int from the boundary configured for `param`, using the shared `_randomizer`.
 * The minimum is inclusive and the maximum is exclusive (`nextInt(maximum - minimum) + minimum`).
 * When minimum and maximum are equal, the single admissible value is returned directly,
 * because `nextInt` rejects a bound of 0. No random draw is consumed in that case.
 * @param param the key to look up in the boundary map
 * @param boundaryMap mapping of parameter name to a (minimum, maximum) pair
 * @return the generated value
 * @throws IllegalArgumentException if the maximum is smaller than the minimum
 */
def generateRandomInteger(param: String,
                          boundaryMap: Map[String, (AnyVal, AnyVal)]): Int = {
  val (minimumValue, maximumValue) =
    extractBoundaryInteger(param, boundaryMap)
  val span = maximumValue - minimumValue
  if (span == 0) minimumValue
  else _randomizer.nextInt(span) + minimumValue
}
/**
 * Picks one of the candidate strings configured for `param` by shuffling the candidate list
 * with the shared `_randomizer` and taking the first element.
 * @param param the key to look up in the boundary map
 * @param boundaryMap mapping of parameter name to the list of candidate values
 * @return the selected candidate
 */
def generateRandomString(param: String,
                         boundaryMap: Map[String, List[String]]): String = {
  val candidates = boundaryMap(param)
  val shuffledCandidates = _randomizer.shuffle(candidates)
  shuffledCandidates.head
}
/**
 * Unbiased coin flip based on `math.random`.
 * @return true when the random draw is below 0.5, false otherwise
 */
def coinFlip(): Boolean = {
  val draw = math.random
  draw < 0.5
}
/**
 * Biased choice between a parent and a child flag based on `math.random`.
 * @param parent value returned when the random draw is below `p`
 * @param child value returned otherwise
 * @param p threshold the random draw is compared against
 * @return `parent` or `child`
 */
def coinFlip(parent: Boolean, child: Boolean, p: Double): Boolean = {
  val keepParent = math.random < p
  if (keepParent) parent else child
}
/**
 * Builds a layer-size array: the input feature size first, then `nLayers` hidden sizes computed
 * as `inputFeatureSize + nLayers - x + hiddenLayerSizeAdjust` for x = 1..nLayers, and the
 * distinct class count last.
 * @param inputFeatureSize size of the first layer
 * @param distinctClasses size of the last layer
 * @param nLayers number of hidden layers to generate (none when less than 1)
 * @param hiddenLayerSizeAdjust offset added to every hidden layer size
 * @return the layer sizes in order
 */
def buildLayerArray(inputFeatureSize: Int,
                    distinctClasses: Int,
                    nLayers: Int,
                    hiddenLayerSizeAdjust: Int): Array[Int] = {
  val hiddenLayerSizes = (1 to nLayers).map { layerIndex =>
    inputFeatureSize + nLayers - layerIndex + hiddenLayerSizeAdjust
  }
  (inputFeatureSize +: hiddenLayerSizes :+ distinctClasses).toArray
}
/**
  * Generates a random layer-size array.
  *
  * The hidden-layer count is drawn first and the size adjustment second, both through
  * `generateRandomInteger`; the result is then assembled by `buildLayerArray`.
  *
  * @param layerParam       boundary-map key for the hidden-layer count
  * @param layerSizeParam   boundary-map key for the hidden-layer size adjustment
  * @param boundaryMap      map of parameter name to (minimum, maximum)
  * @param inputFeatureSize first element of the generated array
  * @param distinctClasses  last element of the generated array
  * @return the generated layer sizes
  */
def generateLayerArray(layerParam: String,
                       layerSizeParam: String,
                       boundaryMap: Map[String, (AnyVal, AnyVal)],
                       inputFeatureSize: Int,
                       distinctClasses: Int): Array[Int] = {
  val hiddenLayerCount = generateRandomInteger(layerParam, boundaryMap)
  val sizeAdjustment = generateRandomInteger(layerSizeParam, boundaryMap)
  buildLayerArray(
    inputFeatureSize,
    distinctClasses,
    hiddenLayerCount,
    sizeAdjustment
  )
}
/**
  * Selects a randomly sized, sorted sample of indices from `0 until maximum`.
  *
  * The sample size is `minimum` plus a random draw in `[0, parameterCount - minimum]`,
  * raised to 1 if it would otherwise be lower.
  *
  * @param minimum        lower bound of the sample size (before the floor of 1)
  * @param maximum        exclusive upper bound of the index range
  * @param parameterCount upper bound of the sample size
  * @return the selected indices in ascending order
  */
def getRandomIndeces(minimum: Int,
                     maximum: Int,
                     parameterCount: Int): List[Int] = {
  val countSource = new scala.util.Random
  val drawnCount = minimum + countSource.nextInt((parameterCount - minimum) + 1)
  val sampleSize = math.max(1, drawnCount)
  val candidates = List.range(0, maximum)
  scala.util.Random.shuffle(candidates).take(sampleSize).sorted
}
/**
  * Selects a sorted random sample of `parameterCount` indices from `0 until maximum`.
  *
  * @param minimum        not used by this method
  * @param maximum        exclusive upper bound of the index range
  * @param parameterCount number of indices to take
  * @return the selected indices in ascending order
  */
def getFixedIndeces(minimum: Int,
                    maximum: Int,
                    parameterCount: Int): List[Int] = {
  val shuffler = new scala.util.Random
  val shuffled = shuffler.shuffle(List.range(0, maximum))
  shuffled.take(parameterCount).sorted
}
/**
  * Generates the index sets to mutate, one per iteration, according to `_mutationMagnitudeMode`.
  *
  * "random" delegates to `getRandomIndeces` and "fixed" to `getFixedIndeces`. Any other mode
  * raises an exception instead of being silently skipped.
  *
  * NOTE: the iteration range `0 to mutationCount` is inclusive, so the result holds
  * `mutationCount + 1` entries. This is kept as-is so the array shape seen by callers
  * does not change.
  *
  * @param minimum        forwarded to the index selector
  * @param maximum        forwarded to the index selector
  * @param parameterCount forwarded to the index selector
  * @param mutationCount  upper (inclusive) bound of the iteration range
  * @return one sorted index list per iteration
  * @throws UnsupportedOperationException if `_mutationMagnitudeMode` is neither "random" nor "fixed"
  */
def generateMutationIndeces(minimum: Int,
                            maximum: Int,
                            parameterCount: Int,
                            mutationCount: Int): Array[List[Int]] = {
  (0 to mutationCount).map { _ =>
    _mutationMagnitudeMode match {
      case "random" => getRandomIndeces(minimum, maximum, parameterCount)
      case "fixed"  => getFixedIndeces(minimum, maximum, parameterCount)
      case _ =>
        // The exception was previously constructed but never thrown.
        throw new UnsupportedOperationException(
          s"Unsupported mutationMagnitudeMode ${_mutationMagnitudeMode}"
        )
    }
  }.toArray
}
/**
  * Blends a parent and a child Double as a weighted sum.
  *
  * @param parent                   parent value
  * @param child                    child value
  * @param parentMutationPercentage weight given to the parent; the child gets `1 - weight`
  * @return `parent * weight + child * (1 - weight)`
  */
def geneMixing(parent: Double,
               child: Double,
               parentMutationPercentage: Double): Double = {
  val parentShare = parent * parentMutationPercentage
  val childShare = child * (1 - parentMutationPercentage)
  parentShare + childShare
}
/**
  * Blends a parent and a child Int as a weighted sum, converted back to Int with `toInt`.
  *
  * @param parent                   parent value
  * @param child                    child value
  * @param parentMutationPercentage weight given to the parent; the child gets `1 - weight`
  * @return the weighted sum after `toInt` conversion
  */
def geneMixing(parent: Int,
               child: Int,
               parentMutationPercentage: Double): Int = {
  val parentShare = parent * parentMutationPercentage
  val childShare = child * (1 - parentMutationPercentage)
  (parentShare + childShare).toInt
}
/**
  * Picks either the parent or the child String at random.
  *
  * @param parent parent value
  * @param child  child value
  * @return the first element of the shuffled (parent, child) pair
  */
def geneMixing(parent: String, child: String): String = {
  val candidates = List(parent, child)
  scala.util.Random.shuffle(candidates).head
}
/**
  * Blends two layer-size arrays into a new one.
  *
  * The first and last elements are taken from the parent. The hidden-layer count
  * (`length - 2`) and the size offset (second element minus the parent's first element)
  * of both arrays are mixed with the Int `geneMixing` overload and passed to `buildLayerArray`.
  *
  * @param parent                   parent layer sizes; its second element is read
  * @param child                    child layer sizes; its second element is read
  * @param parentMutationPercentage weight given to the parent values
  * @return the mixed layer sizes
  */
def geneMixing(parent: Array[Int],
               child: Array[Int],
               parentMutationPercentage: Double): Array[Int] = {
  val inputSize = parent.head
  val outputSize = parent.last
  val mixedLayerCount =
    geneMixing(parent.length - 2, child.length - 2, parentMutationPercentage)
  // Both offsets are measured against the parent's first element.
  val parentOffset = parent(1) - inputSize
  val childOffset = child(1) - inputSize
  val mixedOffset = geneMixing(parentOffset, childOffset, parentMutationPercentage)
  buildLayerArray(inputSize, outputSize, mixedLayerCount, mixedOffset)
}
/**
  * Estimates how far the genetic algorithm has progressed, as a percentage of `totalModels`.
  * @note Due to the asynchronous nature of the algorithm the figure is approximate: it reflects
  *       when the Futures were created and submitted to the thread pool, not when they finish.
  * @param currentGen The generation the run is currently on
  * @param currentModel The index of the model currently being run
  * @return The completed share of the modeling portion of the run, in percent
  * @since 0.2.1
  * @author Ben Wilson
  */
def calculateModelingFamilyRemainingTime(currentGen: Int,
                                         currentModel: Int): Double = {
  val modelsComplete =
    if (_evolutionStrategy == "batch") {
      if (currentGen == 1) currentModel
      else
        _firstGenerationGenePool + (_numberOfMutationsPerGeneration * (currentGen - 2) + currentModel)
    } else {
      // Every non-batch strategy counts the generation number on top of the first gene pool.
      currentGen + _firstGenerationGenePool
    }
  (modelsComplete.toDouble / totalModels.toDouble) * 100
}
/**
  * Decides whether a classification problem is binary by counting the distinct values
  * of the label column (used to pick the evaluator for scoring and optimization).
  * @param df source Dataframe (prior to splitting for train/test)
  * @return true when the label column has at most two distinct values, false otherwise
  * @since 0.4.0
  * @author Ben Wilson
  */
def classificationAdjudicator(df: DataFrame): Boolean = {
  val distinctLabels = df.select(_labelCol).distinct().count()
  distinctLabels <= 2
}
/**
  * Restricts the metrics available for optimizing a classification problem.
  * @param binaryValidation boolean check from the classificationAdjudicator() method
  * @param metricPayload the list of allowable classification metrics
  * @return the payload unchanged for a binary problem; otherwise the payload with
  *         "areaUnderROC" and "areaUnderPR" removed via `diff`
  * @since 0.4.0
  * @author Ben Wilson
  */
def classificationMetricValidator(
  binaryValidation: Boolean,
  metricPayload: List[String]
): List[String] = {
  val binaryOnlyMetrics = List("areaUnderROC", "areaUnderPR")
  if (!binaryValidation) metricPayload.diff(binaryOnlyMetrics)
  else metricPayload
}
/**
  * Scores a transformed DataFrame for a classification model.
  *
  * "areaUnderPR" and "areaUnderROC" are evaluated with a BinaryClassificationEvaluator reading
  * the "probability" column; every other metric goes through a
  * MulticlassClassificationEvaluator reading the "prediction" column.
  * @param metricName the metric to be tested against (both for binary and multi-class)
  * @param labelColumn the column name in the data set that holds the true label
  * @param data the DataFrame that has been transformed
  * @return the score, as a Double value.
  * @since 0.4.0
  * @author Ben Wilson
  */
def classificationScoring(metricName: String,
                          labelColumn: String,
                          data: DataFrame): Double = {
  val binaryOnlyMetrics = Set("areaUnderPR", "areaUnderROC")
  if (binaryOnlyMetrics.contains(metricName)) {
    val binaryEvaluator = new BinaryClassificationEvaluator()
      .setLabelCol(labelColumn)
      .setRawPredictionCol("probability")
      .setMetricName(metricName)
    binaryEvaluator.evaluate(data)
  } else {
    val multiclassEvaluator = new MulticlassClassificationEvaluator()
      .setLabelCol(labelColumn)
      .setPredictionCol("prediction")
      .setMetricName(metricName)
    multiclassEvaluator.evaluate(data)
  }
}
/**
  * Scores a transformed DataFrame for a regression model with a RegressionEvaluator.
  * @param metricName The metric to compute
  * @param labelColumn The name of the label column
  * @param data the DataFrame that has been transformed by a model.
  * @return the score for the metric, as a Double value.
  * @since 0.4.0
  * @author Ben Wilson
  */
def regressionScoring(metricName: String,
                      labelColumn: String,
                      data: DataFrame): Double = {
  val evaluator = new RegressionEvaluator()
    .setLabelCol(labelColumn)
    .setMetricName(metricName)
  evaluator.evaluate(data)
}
/**
  * Determines the mutation aggressiveness for an iteration.
  *
  * With the "linear" strategy the value is `totalConfigs - (currentIteration + 1)`,
  * never lower than 1. Every other strategy yields `_fixedMutationValue`.
  *
  * @param totalConfigs     total number of configurations
  * @param currentIteration index of the current iteration
  * @return the mutation aggressiveness to apply
  */
def generateAggressiveness(totalConfigs: Int, currentIteration: Int): Int = {
  if (_generationalMutationStrategy == "linear") {
    val remaining = totalConfigs - (currentIteration + 1)
    math.max(1, remaining)
  } else {
    _fixedMutationValue
  }
}
}
© 2015 - 2025 Weber Informatics LLC | Privacy Policy