All Downloads are FREE. Search and download functionalities are using the official Maven repository.

com.databricks.labs.automl.model.Evolution.scala Maven / Gradle / Ivy

The newest version!
package com.databricks.labs.automl.model

import com.databricks.labs.automl.model.tools.split.PerformanceSettings
import com.databricks.labs.automl.params.{
  Defaults,
  EvolutionDefaults,
  KSampleConfig,
  RandomForestConfig
}
import com.databricks.labs.automl.utils.{
  DataValidation,
  SeedConverters,
  SparkSessionWrapper
}
import org.apache.spark.ml.evaluation.{
  BinaryClassificationEvaluator,
  MulticlassClassificationEvaluator,
  RegressionEvaluator
}
import org.apache.spark.sql.DataFrame

import scala.collection.mutable.ArrayBuffer
import scala.reflect.runtime.universe._

trait Evolution
    extends DataValidation
    with EvolutionDefaults
    with SeedConverters
    with SparkSessionWrapper
    with Defaults {

  // Mutable configuration state for the evolutionary tuning run. Every field starts from a `_default*` value
  // (declared outside this block, presumably in the mixed-in defaults traits) and is changed through the
  // setters defined further down in this trait.

  // --- Column names, train/test split and general run settings ---
  var _labelCol: String = _defaultLabel
  var _featureCol: String = _defaultFeature
  var _trainPortion: Double = _defaultTrainPortion
  var _trainSplitMethod: String = _defaultTrainSplitMethod
  var _kSampleConfig: KSampleConfig = _defaultKSampleConfig
  var _trainSplitChronologicalColumn: String =
    _defaultTrainSplitChronologicalColumn
  var _trainSplitChronologicalRandomPercentage: Double =
    _defaultTrainSplitChronologicalRandomPercentage
  var _parallelism: Int = _defaultParallelism
  var _kFold: Int = _defaultKFold
  var _seed: Long = _defaultSeed
  // Parallel range over the fold indexes 0 until _kFold; rebuilt by setKFold when the fold count changes.
  var _kFoldIteratorRange: scala.collection.parallel.immutable.ParRange =
    Range(0, _kFold).par
  var _fieldsToIgnore = _defaultFieldsToIgnoreInVector

  // --- Batch-mode genetic algorithm settings ---
  var _optimizationStrategy: String = _defaultOptimizationStrategy
  var _firstGenerationGenePool: Int = _defaultFirstGenerationGenePool
  var _numberOfMutationGenerations: Int = _defaultNumberOfMutationGenerations
  var _numberOfParentsToRetain: Int = _defaultNumberOfParentsToRetain
  var _numberOfMutationsPerGeneration: Int =
    _defaultNumberOfMutationsPerGeneration
  var _geneticMixing: Double = _defaultGeneticMixing
  var _generationalMutationStrategy: String =
    _defaultGenerationalMutationStrategy
  var _mutationMagnitudeMode: String = _defaultMutationMagnitudeMode
  var _fixedMutationValue: Int = _defaultFixedMutationValue
  var _earlyStoppingScore: Double = _defaultEarlyStoppingScore
  var _earlyStoppingFlag: Boolean = _defaultEarlyStoppingFlag

  // --- Evolution strategy selection, MBO and continuous-mode settings ---
  var _evolutionStrategy: String = _defaultEvolutionStrategy
  var _geneticMBOCandidateFactor: Int = _defaultGeneticMBOCandidateFactor
  var _geneticMBORegressorType: String = _defaultGeneticMBORegressorType
  var _continuousEvolutionImprovementThreshold: Int =
    _defaultContinuousEvolutionImprovementThreshold
  var _continuousEvolutionMaxIterations: Int =
    _defaultContinuousEvolutionMaxIterations
  var _continuousEvolutionStoppingScore: Double =
    _defaultContinuousEvolutionStoppingScore
  var _continuousEvolutionParallelism: Int =
    _defaultContinuousEvolutionParallelism
  var _continuousEvolutionMutationAggressiveness: Int =
    _defaultContinuousEvolutionMutationAggressiveness
  var _continuousEvolutionGeneticMixing: Double =
    _defaultContinuousEvolutionGeneticMixing
  var _continuousEvolutionRollingImprovementCount: Int =
    _defaultContinuousEvolutionRollingImprovementCount

  // --- Initial (first) generation settings ---
  var _initialGenerationMode: String = _defaultFirstGenMode
  var _initialGenerationPermutationCount: Int = _defaultFirstGenPermutations
  var _initialGenerationIndexMixingMode: String =
    _defaultFirstGenIndexMixingMode
  var _initialGenerationArraySeed: Long = _defaultFirstGenArraySeed
  var _hyperSpaceModelCount: Int = _defaultHyperSpaceModelCount

  // --- Optional model seed; the flag is flipped to true by setModelSeed ---
  var _modelSeedSet: Boolean = false
  var _modelSeed: Map[String, Any] = Map.empty

  var _dataReduce: Double = _defaultDataReduce

  // --- KSample settings, each initialised from the matching member of _defaultKSampleConfig ---
  var _syntheticCol: String = _defaultKSampleConfig.syntheticCol
  var _kGroups: Int = _defaultKSampleConfig.kGroups
  var _kMeansMaxIter: Int = _defaultKSampleConfig.kMeansMaxIter
  var _kMeansTolerance: Double = _defaultKSampleConfig.kMeansTolerance
  var _kMeansDistanceMeasurement: String =
    _defaultKSampleConfig.kMeansDistanceMeasurement
  var _kMeansSeed: Long = _defaultKSampleConfig.kMeansSeed
  var _kMeansPredictionCol: String = _defaultKSampleConfig.kMeansPredictionCol
  var _lshHashTables: Int = _defaultKSampleConfig.lshHashTables
  var _lshSeed: Long = _defaultKSampleConfig.lshSeed
  var _lshOutputCol: String = _defaultKSampleConfig.lshOutputCol
  var _quorumCount: Int = _defaultKSampleConfig.quorumCount
  var _minimumVectorCountToMutate: Int =
    _defaultKSampleConfig.minimumVectorCountToMutate
  var _vectorMutationMethod: String = _defaultKSampleConfig.vectorMutationMethod
  var _mutationMode: String = _defaultKSampleConfig.mutationMode
  var _mutationValue: Double = _defaultKSampleConfig.mutationValue
  var _labelBalanceMode: String = _defaultKSampleConfig.labelBalanceMode
  var _cardinalityThreshold: Int = _defaultKSampleConfig.cardinalityThreshold
  var _numericRatio: Double = _defaultKSampleConfig.numericRatio
  var _numericTarget: Int = _defaultKSampleConfig.numericTarget

  // The value assigned here is the `scala.util.Random` singleton object (not a new instance), so the setSeed
  // call below seeds that shared generator with the value _seed holds at construction time.
  // NOTE(review): setSeed() further down only updates _seed and does not reseed this generator - confirm intended.
  var _randomizer: scala.util.Random = scala.util.Random
  _randomizer.setSeed(_seed)

  /**
    * Setter for the label column name.
    * @param value String: the column name to store as the label column
    * @return this
    */
  def setLabelCol(value: String): this.type = {
    this._labelCol = value
    this
  }

  /**
    * Setter for the feature column name.
    * @param value String: the column name to store as the feature column
    * @return this
    */
  def setFeaturesCol(value: String): this.type = {
    this._featureCol = value
    this
  }

  /**
    * Setter for the collection of field names held in `_fieldsToIgnore`.
    * @param value Array[String]: the field names to store
    * @return this
    */
  def setFieldsToIgnore(value: Array[String]): this.type = {
    this._fieldsToIgnore = value
    this
  }

  /**
    * Setter for the training portion of the train/test split.
    * @param value Double: must lie strictly between 0.0 and 1.0
    * @return this
    * @throws IllegalArgumentException if the value is outside of the open range (0, 1)
    */
  def setTrainPortion(value: Double): this.type = {
    val withinOpenUnitInterval = value > 0.0 && value < 1.0
    require(
      withinOpenUnitInterval,
      "Training portion must be in the range > 0 and < 1"
    )
    this._trainPortion = value
    this
  }

  /**
    * Setter for the train/test split method.
    * @param value String: must be a member of `allowableTrainSplitMethod`
    * @return this
    * @throws IllegalArgumentException if the value is not one of the allowable split methods
    */
  def setTrainSplitMethod(value: String): this.type = {
    val isSupported = allowableTrainSplitMethod.contains(value)
    require(
      isSupported,
      s"TrainSplitMethod $value must be one of: ${allowableTrainSplitMethod.mkString(", ")}"
    )
    this._trainSplitMethod = value
    this
  }

  /**
    * Setter for the name of the synthetic column.
    * @param value String: a column name that is not already part of the main DataFrame
    * @return this
    * @since 0.5.1
    * @author Ben Wilson
    */
  def setSyntheticCol(value: String): this.type = {
    this._syntheticCol = value
    this
  }

  /**
    * Setter for the number of K-Groups to generate in the KMeans model.
    * @param value Int: the number of k groups
    * @return this
    */
  def setKGroups(value: Int): this.type = {
    this._kGroups = value
    this
  }

  /**
    * Setter for the maximum number of iterations the KMeans model may use to converge.
    * @param value Int: upper limit on iterations
    * @return this
    */
  def setKMeansMaxIter(value: Int): this.type = {
    this._kMeansMaxIter = value
    this
  }

  /**
    * Setter for the KMeans tolerance.
    * @param value Double: the tolerance value for KMeans; must be strictly greater than 0
    * @see reference: [[http://spark.apache.org/docs/latest/api/scala/index.html#org.apache.spark.ml.clustering.KMeans]]
    *      for further details.
    * @return this
    * @throws IllegalArgumentException if the value is not greater than 0 (zero is rejected as well)
    */
  @throws(classOf[IllegalArgumentException])
  def setKMeansTolerance(value: Double): this.type = {
    val isPositive = value > 0
    require(
      isPositive,
      s"KMeans tolerance value ${value.toString} is out of range.  Must be > 0."
    )
    this._kMeansTolerance = value
    this
  }

  /**
    * Setter for the distance measurement used to calculate the nearness of vectors to a centroid.
    * @param value String: Options -> "euclidean" or "cosine" Default: "euclidean"
    * @return this
    * @throws IllegalArgumentException if the value is not in `allowableKMeansDistanceMeasurements`
    */
  @throws(classOf[IllegalArgumentException])
  def setKMeansDistanceMeasurement(value: String): this.type = {
    val isSupported = allowableKMeansDistanceMeasurements.contains(value)
    require(
      isSupported,
      s"Kmeans Distance Measurement $value is not " +
        s"a valid mode of operation.  Must be one of: ${allowableKMeansDistanceMeasurements.mkString(", ")}"
    )
    this._kMeansDistanceMeasurement = value
    this
  }

  /**
    * Setter for the seed of the KMeans clustering algorithm.
    * @param value Long: the seed value
    * @return this
    */
  def setKMeansSeed(value: Long): this.type = {
    this._kMeansSeed = value
    this
  }

  /**
    * Setter for the internal KMeans column used for cluster membership attribution.
    * @param value String: name of the internal group-membership column
    * @return this
    */
  def setKMeansPredictionCol(value: String): this.type = {
    this._kMeansPredictionCol = value
    this
  }

  /**
    * Setter for the number of hash tables to use for MinHashLSH.
    * @param value Int: count of hash tables
    * @see [[http://spark.apache.org/docs/latest/api/scala/index.html#org.apache.spark.ml.feature.MinHashLSH]]
    *     for more information
    * @return this
    */
  def setLSHHashTables(value: Int): this.type = {
    this._lshHashTables = value
    this
  }

  /**
    * Setter for the seed of the LSH model.
    * @param value Long: the seed value
    * @return this
    */
  def setLSHSeed(value: Long): this.type = {
    this._lshSeed = value
    this
  }

  /**
    * Setter for the internal column holding the LSH output hash information.
    * @param value String: column name for the internal MinHashLSH model transformation value
    * @return this
    */
  def setLSHOutputCol(value: String): this.type = {
    this._lshOutputCol = value
    this
  }

  /**
    * Setter for how many vectors adjacent to the centroid are used for generating synthetic data.
    * @note the higher the value set here, the higher the variance in synthetic data generation
    * @param value Int: number of vectors to find nearest each centroid within the class
    * @return this
    */
  def setQuorumCount(value: Int): this.type = {
    this._quorumCount = value
    this
  }

  /**
    * Setter for the minimum threshold of vector indexes to mutate within the feature vector.
    * @note With vectorMutationMethod "fixed" this is the fixed count of vector positions to mutate.
    *       With vectorMutationMethod "random" this is the lower threshold, i.e. 'at least this many indexes
    *       will be mutated'.
    * @param value Int: the minimum (or fixed) number of indexes to mutate
    * @return this
    */
  def setMinimumVectorCountToMutate(value: Int): this.type = {
    this._minimumVectorCountToMutate = value
    this
  }

  /**
    * Setter for the vector mutation method.
    * @note Options:
    *       "fixed" - uses the value of minimumVectorCountToMutate to select that many random indexes.
    *       "random" - uses that number as a lower bound on a random selection of indexes between it and the
    *       vector length.
    *       "all" - mutates all of the vectors.
    * @param value String: the mode to use
    * @return this
    * @throws IllegalArgumentException if the mode is not in `allowableVectorMutationMethods`
    */
  @throws(classOf[IllegalArgumentException])
  def setVectorMutationMethod(value: String): this.type = {
    val isSupported = allowableVectorMutationMethods.contains(value)
    require(
      isSupported,
      s"Vector Mutation Mode $value is not supported.  " +
        s"Must be one of: ${allowableVectorMutationMethods.mkString(", ")} "
    )
    this._vectorMutationMethod = value
    this
  }

  /**
    * Setter for the mutation mode applied to the individual values of the feature vector.
    * @note Options:
    *       "weighted" - uses weighted averaging to scale the euclidean distance between the centroid vector and
    *       mutation candidate vectors
    *       "random" - randomly selects a position on the euclidean vector between the centroid vector and the
    *       candidate mutation vectors
    *       "ratio" - uses a ratio between the values of the centroid vector and the mutation vector
    * @param value String: the mode to use
    * @return this
    * @throws IllegalArgumentException if the mode is not in `allowableMutationModes`
    */
  @throws(classOf[IllegalArgumentException])
  def setMutationMode(value: String): this.type = {
    val isSupported = allowableMutationModes.contains(value)
    require(
      isSupported,
      s"Mutation Mode $value is not a valid mode of operation.  " +
        s"Must be one of: ${allowableMutationModes.mkString(", ")}"
    )
    this._mutationMode = value
    this
  }

  /**
    * Setter for the mutation magnitude used by the 'weighted' and 'ratio' modes of mutationMode.
    * @param value Double: value strictly between 0 and 1 for mutation magnitude adjustment
    * @note the higher this value, the closer to the centroid vector vs. the candidate mutation vector the
    *       synthetic row data will be.
    * @return this
    * @throws IllegalArgumentException if the value is outside of the open range (0, 1)
    */
  @throws(classOf[IllegalArgumentException])
  def setMutationValue(value: Double): this.type = {
    val withinOpenUnitInterval = value < 1 && value > 0
    require(
      withinOpenUnitInterval,
      s"Mutation Value must be between 0 and 1. Value $value is not permitted."
    )
    this._mutationValue = value
    this
  }

  /**
    * Setter - for determining the label balance approach mode.
    * @note Available modes:
    *       'match': Will match all smaller class counts to largest class count.
    *       [WARNING] - May significantly increase memory pressure!
    *       'percentage': Will adjust smaller classes to a percentage value of the largest class count.
    *       'target': Will increase smaller class counts to a fixed numeric target of rows.
    * @param value String: one of: 'match', 'percentage' or 'target'
    * @note Default: "percentage"
    * @since 0.5.1
    * @author Ben Wilson
    * @throws IllegalArgumentException if the provided mode is not supported (raised by `require`).
    */
  @throws(classOf[IllegalArgumentException])
  def setLabelBalanceMode(value: String): this.type = {
    require(
      allowableLabelBalanceModes.contains(value),
      s"Label Balance Mode $value is not supported. " +
        s"Must be one of: ${allowableLabelBalanceModes.mkString(", ")}"
    )
    _labelBalanceMode = value
    this
  }

  /**
    * Setter - for overriding the cardinality threshold exception threshold. [WARNING] increasing this value on
    * a sufficiently large data set could incur, during runtime, excessive memory and cpu pressure on the cluster.
    * @param value Int: the limit above which an exception will be thrown for a classification problem wherein the
    *              label distinct count is too large to successfully generate synthetic data.
    * @note Default: 20
    * @since 0.5.1
    * @author Ben Wilson
    */
  def setCardinalityThreshold(value: Int): this.type = {
    _cardinalityThreshold = value
    this
  }

  /**
    * Setter - for specifying the percentage ratio for the mode 'percentage' in setLabelBalanceMode()
    * @param value Double: A fractional double greater than 0.0 and at most 1.0.
    * @note Setting this value to 1.0 is equivalent to setting the label balance mode to 'match'
    * @note Default: 0.2
    * @since 0.5.1
    * @author Ben Wilson
    * @throws IllegalArgumentException if the provided value is outside of the range (0.0, 1.0]
    */
  @throws(classOf[IllegalArgumentException])
  def setNumericRatio(value: Double): this.type = {
    require(
      value <= 1.0 && value > 0.0,
      s"Invalid Numeric Ratio entered! Must be between 0 and 1. " +
        s"${value.toString} is not valid."
    )
    _numericRatio = value
    this
  }

  /**
    * Setter - for specifying the target row count to generate for 'target' mode in setLabelBalanceMode()
    * @param value Int: The desired final number of rows per minority class label
    * @note [WARNING] Setting this value to too high of a number will greatly increase runtime and memory pressure.
    * @since 0.5.1
    * @author Ben Wilson
    */
  def setNumericTarget(value: Int): this.type = {
    _numericTarget = value
    this
  }

  /**
    * Setter for the column name stored in `_trainSplitChronologicalColumn`.
    * @param value String: the column name
    * @return this
    */
  def setTrainSplitChronologicalColumn(value: String): this.type = {
    _trainSplitChronologicalColumn = value
    this
  }

  /**
    * Setter for the chronological split random percentage. The value is always stored; values above 10 only
    * print a warning to stdout.
    * @param value Double: the percentage value
    * @return this
    */
  def setTrainSplitChronologicalRandomPercentage(value: Double): this.type = {
    _trainSplitChronologicalRandomPercentage = value
    if (value > 10)
      println(
        "[WARNING] setTrainSplitChronologicalRandomPercentage() setting this value above 10 " +
          "percent will cause significant per-run train/test skew and variability in row counts during training. " +
          "Use higher values only if this is desired."
      )
    this
  }

  /**
    * Setter for the parallelism level.
    * @param value Int: must be below 10000
    * @return this
    * @throws IllegalArgumentException if the supplied value is 10000 or higher
    */
  def setParallelism(value: Int): this.type = {
    // Validate the incoming value; the previous check inspected the current field and so never
    // rejected a bad argument.
    require(
      value < 10000,
      s"Parallelism above 10000 will result in cluster instability."
    )
    _parallelism = value
    this
  }

  /**
    * Setter for the k-fold count. Also rebuilds the parallel iterator range to cover 0 until the new count.
    * @param value Int: number of folds
    * @return this
    */
  def setKFold(value: Int): this.type = {
    _kFold = value
    _kFoldIteratorRange = Range(0, _kFold).par
    this
  }

  /**
    * Setter for the seed value.
    * @param value Long: the seed
    * @return this
    */
  def setSeed(value: Long): this.type = {
    // NOTE(review): only the field is updated; _randomizer is not reseeded here - confirm this is intended.
    _seed = value
    this
  }

  /**
    * Setter for the optimization strategy. The value is lower-cased before validation and storage.
    * @param value String: must (case-insensitively) be a member of `allowableOptimizationStrategies`
    * @return this
    * @throws IllegalArgumentException if the lower-cased value is not allowable
    */
  def setOptimizationStrategy(value: String): this.type = {
    val valueLC = value.toLowerCase
    require(
      allowableOptimizationStrategies.contains(valueLC),
      s"Optimization Strategy '$valueLC' is not a member of ${invalidateSelection(valueLC, allowableOptimizationStrategies)}"
    )
    _optimizationStrategy = valueLC
    this
  }

  /**
    * Setter for the size of the first generation gene pool.
    * @param value Int: must be at least 5
    * @return this
    * @throws IllegalArgumentException if the value is less than 5
    */
  def setFirstGenerationGenePool(value: Int): this.type = {
    require(
      value >= 5,
      s"Values less than 5 for firstGenerationGenePool will require excessive generational mutation to converge"
    )
    _firstGenerationGenePool = value
    this
  }

  /**
    * Setter for the number of mutation generations.
    * @param value Int: must be greater than 0
    * @return this
    * @throws IllegalArgumentException if the value is not greater than 0
    */
  def setNumberOfMutationGenerations(value: Int): this.type = {
    require(value > 0, s"Number of Generations must be greater than 0")
    _numberOfMutationGenerations = value
    this
  }

  /**
    * Setter for the number of parents to retain.
    * @param value Int: must be greater than 0
    * @return this
    * @throws IllegalArgumentException if the value is not greater than 0
    */
  def setNumberOfParentsToRetain(value: Int): this.type = {
    require(
      value > 0,
      s"Number of Parents must be greater than 0. '$value' is not a valid number."
    )
    _numberOfParentsToRetain = value
    this
  }

  /**
    * Setter for the number of mutations per generation.
    * @param value Int: must be greater than 0
    * @return this
    * @throws IllegalArgumentException if the value is not greater than 0
    */
  def setNumberOfMutationsPerGeneration(value: Int): this.type = {
    require(
      value > 0,
      s"Number of Mutations per generation must be greater than 0. '$value' is not a valid number."
    )
    _numberOfMutationsPerGeneration = value
    this
  }

  /**
    * Setter for the genetic mixing factor.
    * @param value Double: must lie strictly between 0.0 and 1.0
    * @return this
    * @throws IllegalArgumentException if the value is outside of the open range (0, 1)
    */
  def setGeneticMixing(value: Double): this.type = {
    require(
      value < 1.0 && value > 0.0,
      s"Mutation Aggressiveness must be in range (0,1). Current Setting of $value is not permitted."
    )
    _geneticMixing = value
    this
  }

  /**
    * Setter for the generational mutation strategy. The value is lower-cased before validation and storage.
    * @param value String: must (case-insensitively) be a member of `allowableMutationStrategies`
    * @return this
    * @throws IllegalArgumentException if the lower-cased value is not allowable
    */
  def setGenerationalMutationStrategy(value: String): this.type = {
    val valueLC = value.toLowerCase
    require(
      allowableMutationStrategies.contains(valueLC),
      s"Generational Mutation Strategy '$valueLC' is not a member of ${invalidateSelection(valueLC, allowableMutationStrategies)}"
    )
    _generationalMutationStrategy = valueLC
    this
  }

  /**
    * Setter for the mutation magnitude mode. The value is lower-cased before validation and storage.
    * @param value String: must (case-insensitively) be a member of `allowableMutationMagnitudeMode`
    * @return this
    * @throws IllegalArgumentException if the lower-cased value is not allowable
    */
  def setMutationMagnitudeMode(value: String): this.type = {
    val valueLC = value.toLowerCase
    require(
      allowableMutationMagnitudeMode.contains(valueLC),
      s"Mutation Magnitude Mode '$valueLC' is not a member of ${invalidateSelection(valueLC, allowableMutationMagnitudeMode)}"
    )
    _mutationMagnitudeMode = valueLC
    this
  }

  /**
    * Setter for the fixed mutation count.
    * @param value Int: must be greater than 0 and no larger than the number of case accessors of
    *              RandomForestConfig (as counted by modelConfigLength)
    * @return this
    * @throws IllegalArgumentException if the value is out of that range
    */
  def setFixedMutationValue(value: Int): this.type = {
    val maxMutationCount = modelConfigLength[RandomForestConfig]
    require(
      value <= maxMutationCount,
      s"Mutation count '$value' cannot exceed number of hyperparameters ($maxMutationCount)"
    )
    require(value > 0, s"Mutation count '$value' must be greater than 0")
    _fixedMutationValue = value
    this
  }

  /**
    * Setter for the early stopping score.
    * @param value Double: the score value
    * @return this
    */
  def setEarlyStoppingScore(value: Double): this.type = {
    _earlyStoppingScore = value
    this
  }

  /**
    * Setter for the early stopping flag.
    * @param value Boolean: the flag value
    * @return this
    */
  def setEarlyStoppingFlag(value: Boolean): this.type = {
    _earlyStoppingFlag = value
    this
  }

  /**
    * Setter for the evolution strategy. Unlike the other strategy setters the value is NOT lower-cased.
    * @param value String: must be a member of `allowableEvolutionStrategies`
    * @return this
    * @throws IllegalArgumentException if the value is not allowable
    */
  def setEvolutionStrategy(value: String): this.type = {
    require(
      allowableEvolutionStrategies.contains(value),
      s"Evolution Strategy '$value' is not a supported mode. Must be one of: ${invalidateSelection(value, allowableEvolutionStrategies)}"
    )
    _evolutionStrategy = value
    this
  }

  /**
    * Setter for defining the secondary stopping criteria for continuous training mode (number of consistently
    * not-improving runs to terminate the learning algorithm due to diminishing returns).
    * @param value Negative Integer (an improvement to a priori will reset the counter and subsequent
    *              non-improvements will decrement a mutable counter. If the counter hits this limit specified
    *              in value, the continuous mode algorithm will stop).
    * @author Ben Wilson, Databricks
    * @since 0.6.0
    * @throws IllegalArgumentException if the value is not negative.
    */
  @throws(classOf[IllegalArgumentException])
  def setContinuousEvolutionImprovementThreshold(value: Int): this.type = {
    require(
      value < 0,
      s"ContinuousEvolutionImprovementThreshold must be less than zero. It is " +
        s"recommended to set this value to less than -4."
    )
    _continuousEvolutionImprovementThreshold = value
    this
  }

  /**
    * Setter for selecting the type of Regressor to use for the within-epoch generation MBO of candidates
    * @param value String - one of "XGBoost", "LinearRegression" or "RandomForest"
    * @author Ben Wilson, Databricks
    * @since 0.6.0
    * @throws IllegalArgumentException if the value is not supported
    */
  @throws(classOf[IllegalArgumentException])
  def setGeneticMBORegressorType(value: String): this.type = {
    require(
      allowableMBORegressorTypes.contains(value),
      s"GeneticRegressorType $value is not a supported Regressor " +
        s"Type. Must be one of: ${allowableMBORegressorTypes.mkString(", ")}"
    )
    _geneticMBORegressorType = value
    this
  }

  /**
    * Setter for defining the factor to be applied to the candidate listing of hyperparameters to generate through
    * mutation for each generation other than the initial and post-modeling optimization phases. The larger this
    * value (default: 10), the more potential space can be searched. There is not a large performance hit to this,
    * and as such, values in excess of 100 are viable.
    * @param value Int - a factor to multiply the numberOfMutationsPerGeneration by to generate a count of potential
    *              candidates.
    * @author Ben Wilson, Databricks
    * @since 0.6.0
    * @throws IllegalArgumentException if the value is not greater than zero.
    */
  @throws(classOf[IllegalArgumentException])
  def setGeneticMBOCandidateFactor(value: Int): this.type = {
    require(value > 0, s"GeneticMBOCandidateFactor must be greater than zero.")
    _geneticMBOCandidateFactor = value
    this
  }

  /**
    * Setter for the maximum number of iterations in continuous mode. Values above 500 are accepted but print
    * a warning to stdout.
    * @param value Int: the iteration limit
    * @return this
    */
  def setContinuousEvolutionMaxIterations(value: Int): this.type = {
    if (value > 500)
      println(
        s"[WARNING] Total Modeling count $value is higher than recommended limit of 500. " +
          s"This tuning will take a long time to run."
      )
    _continuousEvolutionMaxIterations = value
    this
  }

  /**
    * Setter for the stopping score in continuous mode.
    * @param value Double: the score value
    * @return this
    */
  def setContinuousEvolutionStoppingScore(value: Double): this.type = {
    _continuousEvolutionStoppingScore = value
    this
  }

  /**
    * Setter for the parallelism in continuous mode. Values above 10 are accepted but print a warning to stdout.
    * @param value Int: the parallelism level
    * @return this
    */
  def setContinuousEvolutionParallelism(value: Int): this.type = {
    if (value > 10)
      println(
        s"[WARNING] ContinuousEvolutionParallelism -> $value is higher than recommended " +
          s"concurrency for efficient optimization for convergence." +
          s"\n Setting this value below 11 will converge faster in most cases."
      )
    _continuousEvolutionParallelism = value
    this
  }

  /**
    * Setter for the mutation aggressiveness in continuous mode. Values above 4 are accepted but print a
    * warning to stdout.
    * @param value Int: the aggressiveness value
    * @return this
    */
  def setContinuousEvolutionMutationAggressiveness(value: Int): this.type = {
    if (value > 4)
      println(
        s"[WARNING] ContinuousEvolutionMutationAggressiveness -> $value. " +
          s"\n Setting this higher than 4 will result in extensive random search and will take longer to converge " +
          s"to optimal hyperparameters."
      )
    _continuousEvolutionMutationAggressiveness = value
    this
  }

  /**
    * Setter for the genetic mixing factor in continuous mode.
    * @param value Double: must lie strictly between 0.0 and 1.0
    * @return this
    * @throws IllegalArgumentException if the value is outside of the open range (0, 1)
    */
  def setContinuousEvolutionGeneticMixing(value: Double): this.type = {
    require(
      value < 1.0 && value > 0.0,
      s"Mutation Aggressiveness must be in range (0,1). Current Setting of $value is not permitted."
    )
    _continuousEvolutionGeneticMixing = value
    this
  }

  /**
    * Setter for the rolling improvement count in continuous mode. Values below 10 are accepted but print a
    * warning to stdout. (The misspelling in the method name is part of the existing public API.)
    * @param value Int: must be greater than 0
    * @return this
    * @throws IllegalArgumentException if the value is not greater than 0
    */
  def setContinuousEvolutionRollingImporvementCount(value: Int): this.type = {
    require(
      value > 0,
      s"ContinuousEvolutionRollingImprovementCount must be > 0. $value is invalid."
    )
    if (value < 10)
      println(
        s"[WARNING] ContinuousEvolutionRollingImprovementCount -> $value setting is low. " +
          s"Optimal Convergence may not occur due to early stopping."
      )
    _continuousEvolutionRollingImprovementCount = value
    this
  }

  /**
    * Setter for the model seed map. Also marks the seed as having been set (`_modelSeedSet = true`).
    * @param value Map[String, Any]: the seed configuration
    * @return this
    */
  def setModelSeed(value: Map[String, Any]): this.type = {
    _modelSeed = value
    _modelSeedSet = true
    this
  }

  /**
    * Setter for the data reduction factor.
    * @param value Double: must lie strictly between 0 and 1
    * @return this
    * @throws IllegalArgumentException if the value is outside of the open range (0, 1)
    */
  def setDataReductionFactor(value: Double): this.type = {
    require(value > 0, s"Data Reduction Factor must be between 0 and 1")
    require(value < 1, s"Data Reduction Factor must be between 0 and 1")
    _dataReduce = value
    this
  }

  /**
    * Setter for the first generation mode.
    * @param value String: must be a member of `allowableInitialGenerationModes`
    * @return this
    * @throws IllegalArgumentException if the value is not allowable
    */
  def setFirstGenMode(value: String): this.type = {
    require(
      allowableInitialGenerationModes.contains(value),
      s"First Generation Mode '$value' is not a supported mode." +
        s" Must be one of: ${invalidateSelection(value, allowableInitialGenerationModes)}"
    )
    _initialGenerationMode = value
    this
  }

  /**
    * Setter for the first generation permutation count.
    * @param value Int: the permutation count
    * @return this
    */
  def setFirstGenPermutations(value: Int): this.type = {
    _initialGenerationPermutationCount = value
    this
  }

  /**
    * Setter for the hyper space model count.
    * @param value Int: the model count
    * @return this
    */
  def setHyperSpaceModelCount(value: Int): this.type = {
    _hyperSpaceModelCount = value
    this
  }

  /**
    * Setter for the first generation index mixing mode.
    * @param value String: must be a member of `allowableInitialGenerationIndexMixingModes`
    * @return this
    * @throws IllegalArgumentException if the value is not allowable
    */
  def setFirstGenIndexMixingMode(value: String): this.type = {
    require(
      allowableInitialGenerationIndexMixingModes.contains(value),
      s"First Generation Mode '$value' is not a " +
        s"supported mode. Must be one of ${invalidateSelection(value, allowableInitialGenerationIndexMixingModes)}"
    )
    _initialGenerationIndexMixingMode = value
    this
  }

  /**
    * Setter for the first generation array seed.
    * @param value Long: the seed value
    * @return this
    */
  def setFirstGenArraySeed(value: Long): this.type = {
    _initialGenerationArraySeed = value
    this
  }

  // Plain accessors: each returns the current value of the backing field named on its right-hand side.
  def getFirstGenArraySeed: Long = _initialGenerationArraySeed
  def getFirstGenIndexMixingMode: String = _initialGenerationIndexMixingMode
  def getFirstGenPermutations: Int = _initialGenerationPermutationCount
  def getFirstGenMode: String = _initialGenerationMode
  def getHyperSpaceModelCount: Int = _hyperSpaceModelCount
  def getLabelCol: String = _labelCol
  def getFeaturesCol: String = _featureCol
  def getFieldsToIgnore: Array[String] = _fieldsToIgnore
  def getTrainPortion: Double = _trainPortion
  def getTrainSplitMethod: String = _trainSplitMethod
  def getTrainSplitChronologicalColumn: String = _trainSplitChronologicalColumn
  def getTrainSplitChronologicalRandomPercentage: Double =
    _trainSplitChronologicalRandomPercentage
  def getParallelism: Int = _parallelism
  def getKFold: Int = _kFold
  def getSeed: Long = _seed
  def getOptimizationStrategy: String = _optimizationStrategy
  def getFirstGenerationGenePool: Int = _firstGenerationGenePool
  def getNumberOfMutationGenerations: Int = _numberOfMutationGenerations
  def getNumberOfParentsToRetain: Int = _numberOfParentsToRetain
  def getNumberOfMutationsPerGeneration: Int = _numberOfMutationsPerGeneration
  def getGeneticMixing: Double = _geneticMixing
  def getGenerationalMutationStrategy: String = _generationalMutationStrategy
  def getMutationMagnitudeMode: String = _mutationMagnitudeMode
  def getFixedMutationValue: Int = _fixedMutationValue
  def getEarlyStoppingScore: Double = _earlyStoppingScore
  def getEarlyStoppingFlag: Boolean = _earlyStoppingFlag
  def getEvolutionStrategy: String = _evolutionStrategy
  def getGeneticMBORegressorType: String = _geneticMBORegressorType
  def getGeneticMBOCandidateFactor: Int = _geneticMBOCandidateFactor
  def getContinuousEvolutionImprovementThreshold: Int =
    _continuousEvolutionImprovementThreshold
  def getContinuousEvolutionMaxIterations: Int =
    _continuousEvolutionMaxIterations
  def getContinuousEvolutionStoppingScore: Double =
    _continuousEvolutionStoppingScore
  def getContinuousEvolutionParallelism: Int = _continuousEvolutionParallelism
  def getContinuousEvolutionMutationAggressiveness: Int =
    _continuousEvolutionMutationAggressiveness
  def getContinuousEvolutionGeneticMixing: Double =
    _continuousEvolutionGeneticMixing
  def getContinuousEvolutionRollingImporvementCount: Int =
    _continuousEvolutionRollingImprovementCount
  def getModelSeed: Map[String, Any] = _modelSeed
  def getDataReductionFactor: Double = _dataReduce

  // DEBUG for logging purposes of configurations.
  /**
    * Builds a multi-line debug string from PerformanceSettings values and the current parallelism setting.
    * @return the formatted debug text
    */
  def debugSettings: String = {
    s"DEBUG: \n Evolution.scala --> xgbWorkers: ${PerformanceSettings.xgbWorkers(_parallelism)} \n " +
      s"Evolution.scala --> totalCores: ${PerformanceSettings.totalCores} \n " +
      s"Evolution.scala --> _parallelism: ${_parallelism} \n " +
      s"Evolution.scala --> getParallelism: ${getParallelism} \n " +
      s"Evolution.scala --> optimalJVMModelPartitions: ${PerformanceSettings.optimalJVMModelPartitions(_parallelism)} \n " +
      s"Evolution.scala --> parTasks: ${PerformanceSettings.parTasks}"
  }

  /**
    * Internal method for validating if a numeric mapping that is specified contains any invalid keys
    * @param standardConfig The static defined numeric mapping for a model type
    * @param modConfig a user-specified mapping override
    * @since 0.6.1
    * @author Ben Wilson, Databricks
    * @throws IllegalArgumentException if the key is invalid for the model type specified.
    */
  @throws(classOf[IllegalArgumentException])
  protected[model] def validateNumericMapping(
    standardConfig: Map[String, (Double, Double)],
    modConfig: Map[String, (Double, Double)]
  ): Unit = {
    val staticKeys = standardConfig.keys.toArray
    val modKeys = modConfig.keys.toArray
    // Fail on the first override key that is not present in the standard mapping.
    modKeys.foreach(
      x =>
        if (!staticKeys.contains(x))
          throw new IllegalArgumentException(
            s"The numeric Boundary map key " +
              s"supplied: [$x] is not a valid member of Numeric Mapping. " +
              s"\nKeys are restricted to: [${staticKeys.mkString(", ")}]"
        )
    )
  }

  /**
    * Internal method for validating if a string mapping that is specified contains any invalid keys
    * @param standardConfig The static defined string mapping for a model type
    * @param modConfig a user-specified mapping override
    * @since 0.6.1
    * @author Ben Wilson, Databricks
    * @throws IllegalArgumentException if the key is invalid for the model type specified.
    */
  @throws(classOf[IllegalArgumentException])
  protected[model] def validateStringMapping(
    standardConfig: Map[String, List[String]],
    modConfig: Map[String, List[String]]
  ): Unit = {
    val staticKeys = standardConfig.keys.toArray
    val modKeys = modConfig.keys.toArray
    // Fail on the first override key that is not present in the standard mapping.
    modKeys.foreach(
      x =>
        if (!staticKeys.contains(x))
          throw new IllegalArgumentException(
            s"The string Boundary map key " +
              s"supplied: [$x] is not a valid member of String Mapping. " +
              s"\nKeys are restricted to: [${staticKeys.mkString(", ")}]"
        )
    )
  }

  /**
    * Helper function for partially updating a numeric mapping
    * @param defaultMap The default configuration Map for a numeric mapping for model hyperparameter search space
    * @param updateMap user-supplied updated map (doesn't have to have all elements in it)
    * @return The default map, updated with the user-supplied overrides
    * @since 0.6.1
    * @author Ben Wilson, Jas Bali Databricks
    */
  def partialOverrideNumericMapping(
    defaultMap: Map[String, (Double, Double)],
    updateMap: Map[String, (Double, Double)]
  ): Map[String, (Double, Double)] = {
    defaultMap ++ updateMap
  }

  /**
    * Helper function for partially updating a string mapping
    *
    * @param defaultMap The default configuration Map for a string mapping for model hyperparameter search space
    * @param updateMap user-supplied updated map (doesn't have to have all elements in it)
    * @return The default map, updated with the user-supplied overrides
    * @since 0.6.1
    * @author Ben Wilson, Jas Bali Databricks
    */
  def partialOverrideStringMapping(
    defaultMap: Map[String, List[String]],
    updateMap: Map[String, List[String]]
  ): Map[String, List[String]] = {
    defaultMap ++ updateMap
  }

  // TODO - Calculation should take into account early stopping
  /**
    * Computes a model count from the current settings, depending on the evolution strategy.
    * @return the computed count for "batch" or "continuous" mode
    * @throws MatchError if the evolution strategy is neither "batch" nor "continuous"
    */
  def totalModels: Int = _evolutionStrategy match {
    case "batch" =>
      (_numberOfMutationsPerGeneration * _numberOfMutationGenerations) +
        _firstGenerationGenePool +
        _initialGenerationPermutationCount +
        _hyperSpaceModelCount
    case "continuous" =>
      _continuousEvolutionMaxIterations - _continuousEvolutionParallelism + _firstGenerationGenePool
    case _ =>
      throw new MatchError(
        s"EvolutionStrategy mode ${_evolutionStrategy} is not supported." +
          s"\n Choose one of: ${allowableEvolutionStrategies.mkString(", ")}"
      )
  }

  /**
    * Counts the case accessors (case class constructor fields) of the type T via runtime reflection.
    * @tparam T the type to inspect
    * @return the number of case accessor members of T
    */
  def modelConfigLength[T: TypeTag]: Int = {
    typeOf[T].members
      .collect { case m: MethodSymbol if m.isCaseAccessor => m }
      .toList
      .length
  }

  /**
    * Reads the (minimum, maximum) pair for a parameter from the boundary map as Doubles.
    * @param param key to look up; a missing key fails with the map's NoSuchElementException
    * @param boundaryMap mapping of parameter name to (min, max); both values are cast to Double
    * @return the (minimum, maximum) tuple
    */
  def extractBoundaryDouble(
    param: String,
    boundaryMap: Map[String, (AnyVal, AnyVal)]
  ): (Double, Double) = {
    val minimum = boundaryMap(param)._1.asInstanceOf[Double]
    val maximum = boundaryMap(param)._2.asInstanceOf[Double]
    (minimum, maximum)
  }

  /**
    * Reads the (minimum, maximum) pair for a parameter from the boundary map, truncated to Ints.
    * @param param key to look up; a missing key fails with the map's NoSuchElementException
    * @param boundaryMap mapping of parameter name to (min, max); both values are cast to Double, then `.toInt`
    * @return the (minimum, maximum) tuple
    */
  def extractBoundaryInteger(
    param: String,
    boundaryMap: Map[String, (AnyVal, AnyVal)]
  ): (Int, Int) = {
    val minimum = boundaryMap(param)._1.asInstanceOf[Double].toInt
    val maximum = boundaryMap(param)._2.asInstanceOf[Double].toInt
    (minimum, maximum)
  }

  /**
    * Draws a Double from `_randomizer`, scaled into the parameter's boundary range.
    * @param param key of the parameter in the boundary map
    * @param boundaryMap mapping of parameter name to (min, max)
    * @return minimum + nextDouble() * (maximum - minimum)
    */
  def generateRandomDouble(
    param: String,
    boundaryMap: Map[String, (AnyVal, AnyVal)]
  ): Double = {
    val (minimumValue, maximumValue) = extractBoundaryDouble(param, boundaryMap)
    minimumValue + _randomizer.nextDouble() * (maximumValue - minimumValue)
  }

  /**
    * Draws an Int from `_randomizer` within the parameter's boundary range.
    * @param param key of the parameter in the boundary map
    * @param boundaryMap mapping of parameter name to (min, max)
    * @return a value in [minimum, maximum) - the upper bound is exclusive because of `nextInt(n)`
    */
  def generateRandomInteger(param: String,
                            boundaryMap: Map[String, (AnyVal, AnyVal)]): Int = {
    val (minimumValue, maximumValue) =
      extractBoundaryInteger(param, boundaryMap)
    // NOTE(review): nextInt requires a positive bound, so maximum <= minimum fails here - confirm callers.
    _randomizer.nextInt(maximumValue - minimumValue) + minimumValue
  }

  /**
    * Picks one of the allowed String values for a parameter by shuffling the list with `_randomizer`.
    * @param param key of the parameter in the boundary map
    * @param boundaryMap mapping of parameter name to the list of candidate values
    * @return the head of the shuffled candidate list
    */
  def generateRandomString(param: String,
                           boundaryMap: Map[String, List[String]]): String = {
    _randomizer.shuffle(boundaryMap(param)).head
  }

  /**
    * Returns true when `math.random` yields a value below 0.5. Does not use `_randomizer`.
    */
  def coinFlip(): Boolean = {
    math.random < 0.5
  }

  /**
    * Chooses between the parent and child Boolean values.
    * @param parent value returned when `math.random` is below p
    * @param child value returned otherwise
    * @param p threshold compared against `math.random`
    * @return parent or child
    */
  def coinFlip(parent: Boolean, child: Boolean, p: Double): Boolean = {
    if (math.random < p) parent else child
  }

  /**
    * Builds a layer-size array: the input size, then nLayers hidden sizes, then the class count.
    * Hidden layer x (1-based) has size inputFeatureSize + nLayers - x + hiddenLayerSizeAdjust.
    * @param inputFeatureSize first element of the array
    * @param distinctClasses last element of the array
    * @param nLayers number of hidden layer entries to generate
    * @param hiddenLayerSizeAdjust constant added to every hidden layer size
    * @return the layer array of length nLayers + 2
    */
  def buildLayerArray(inputFeatureSize: Int,
                      distinctClasses: Int,
                      nLayers: Int,
                      hiddenLayerSizeAdjust: Int): Array[Int] = {
    val layerConstruct = new ArrayBuffer[Int]
    layerConstruct += inputFeatureSize
    (1 to nLayers).foreach { x =>
      layerConstruct += inputFeatureSize + nLayers - x + hiddenLayerSizeAdjust
    }
    layerConstruct += distinctClasses
    layerConstruct.result.toArray
  }

  /**
    * Generates a random layer array: draws the layer count and the size adjustment with
    * generateRandomInteger, then delegates to buildLayerArray.
    * @param layerParam boundary-map key for the number of layers
    * @param layerSizeParam boundary-map key for the hidden layer size adjustment
    * @param boundaryMap mapping of parameter name to (min, max)
    * @param inputFeatureSize first element of the resulting array
    * @param distinctClasses last element of the resulting array
    * @return the generated layer array
    */
  def generateLayerArray(layerParam: String,
                         layerSizeParam: String,
                         boundaryMap: Map[String, (AnyVal, AnyVal)],
                         inputFeatureSize: Int,
                         distinctClasses: Int): Array[Int] = {
    val layersToGenerate = generateRandomInteger(layerParam, boundaryMap)
    val hiddenLayerSizeAdjust =
      generateRandomInteger(layerSizeParam, boundaryMap)
    buildLayerArray(
      inputFeatureSize,
      distinctClasses,
      layersToGenerate,
      hiddenLayerSizeAdjust
    )
  }

  /**
    * Selects a random-sized, sorted set of indexes from 0 until maximum.
    * The count is minimum + a random Int in [0, parameterCount - minimum], floored at 1.
    * @param minimum lower bound for the number of indexes
    * @param maximum exclusive upper bound of the index range
    * @param parameterCount upper bound for the number of indexes
    * @return the selected indexes in ascending order
    */
  def getRandomIndeces(minimum: Int,
                       maximum: Int,
                       parameterCount: Int): List[Int] = {
    val fullIndexArray = List.range(0, maximum)
    // A fresh generator decides the count; the shuffle below uses the shared scala.util.Random object.
    val randomSeed = new scala.util.Random
    val count = minimum + randomSeed.nextInt((parameterCount - minimum) + 1)
    val adjCount = if (count < 1) 1 else count
    val shuffledArray = scala.util.Random.shuffle(fullIndexArray).take(adjCount)
    shuffledArray.sortWith(_ < _)
  }

  /**
    * Selects exactly parameterCount (or fewer, if maximum is smaller) sorted indexes from 0 until maximum.
    * @param minimum not used by this method
    * @param maximum exclusive upper bound of the index range
    * @param parameterCount number of indexes to take
    * @return the selected indexes in ascending order
    */
  def getFixedIndeces(minimum: Int,
                      maximum: Int,
                      parameterCount: Int): List[Int] = {
    val fullIndexArray = List.range(0, maximum)
    val randomSeed = new scala.util.Random
    randomSeed.shuffle(fullIndexArray).take(parameterCount).sortWith(_ < _)
  }

  /**
    * Generates one index list per iteration, using the strategy chosen by `_mutationMagnitudeMode`.
    * @param minimum passed through to the index selection
    * @param maximum passed through to the index selection
    * @param parameterCount passed through to the index selection
    * @param mutationCount inclusive upper bound of the loop, so mutationCount + 1 lists are produced
    * @return the generated index lists
    * @throws UnsupportedOperationException if `_mutationMagnitudeMode` is neither "random" nor "fixed"
    */
  def generateMutationIndeces(minimum: Int,
                              maximum: Int,
                              parameterCount: Int,
                              mutationCount: Int): Array[List[Int]] = {
    val mutations = new ArrayBuffer[List[Int]]
    // NOTE(review): `0 to mutationCount` is inclusive; kept as-is since callers may rely on the extra entry.
    for (_ <- 0 to mutationCount) {
      _mutationMagnitudeMode match {
        case "random" =>
          mutations += getRandomIndeces(minimum, maximum, parameterCount)
        case "fixed" =>
          mutations += getFixedIndeces(minimum, maximum, parameterCount)
        case _ =>
          // Previously the exception was constructed but never thrown, silently producing no entries.
          throw new UnsupportedOperationException(
            s"Unsupported mutationMagnitudeMode ${_mutationMagnitudeMode}"
          )
      }
    }
    mutations.result.toArray
  }

  /**
    * Weighted blend of a parent and child Double.
    * @param parent parent value
    * @param child child value
    * @param parentMutationPercentage weight of the parent; the child gets (1 - weight)
    * @return parent * weight + child * (1 - weight)
    */
  def geneMixing(parent: Double,
                 child: Double,
                 parentMutationPercentage: Double): Double = {
    (parent * parentMutationPercentage) + (child * (1 - parentMutationPercentage))
  }

  /**
    * Weighted blend of a parent and child Int, truncated with `.toInt`.
    * @param parent parent value
    * @param child child value
    * @param parentMutationPercentage weight of the parent; the child gets (1 - weight)
    * @return the truncated weighted sum
    */
  def geneMixing(parent: Int,
                 child: Int,
                 parentMutationPercentage: Double): Int = {
    ((parent * parentMutationPercentage) + (child * (1 - parentMutationPercentage))).toInt
  }

  /**
    * Picks either the parent or the child String by shuffling the pair with the shared scala.util.Random.
    * @param parent parent value
    * @param child child value
    * @return one of the two inputs
    */
  def geneMixing(parent: String, child: String): String = {
    val mixed = new ArrayBuffer[String]
    mixed += parent += child
    scala.util.Random.shuffle(mixed.toList).head
  }

  /**
    * Blends two layer arrays: mixes the hidden layer counts and the first-hidden-layer size offsets
    * (both via the Int geneMixing), then rebuilds an array with buildLayerArray using the parent's
    * first and last elements.
    * @param parent parent layer array; its head, last and element at index 1 are read
    * @param child child layer array; its length and element at index 1 are read
    * @param parentMutationPercentage weight of the parent in both blends
    * @return the rebuilt layer array
    */
  def geneMixing(parent: Array[Int],
                 child: Array[Int],
                 parentMutationPercentage: Double): Array[Int] = {
    val staticStart = parent.head
    val staticEnd = parent.last
    val parentHiddenLayers = parent.length - 2
    val childHiddenLayers = child.length - 2
    val parentMagnitude = parent(1) - staticStart
    val childMagnitude = child(1) - staticStart
    val hiddenLayerMix = geneMixing(
      parentHiddenLayers,
      childHiddenLayers,
      parentMutationPercentage
    )
    val sizeAdjustMix =
      geneMixing(parentMagnitude, childMagnitude, parentMutationPercentage)
    buildLayerArray(staticStart, staticEnd, hiddenLayerMix, sizeAdjustMix)
  }

  /**
    * Method for calculating the remaining time left on the genetic algorithm training (roughly)
    * @note Due to the asynchronous nature of the algorithm, the times are not exact and are a reflection of time
    *       since the creation of the Futures and when they were initially inserted into the thread pool.
    * @param currentGen The current Generation that the model is running on
    * @param currentModel The index of the current model that is being run.
    * @return A Double representing the total completion percentage of the modeling portion of the run.
* @since 0.2.1 * @author Ben Wilson */ def calculateModelingFamilyRemainingTime(currentGen: Int, currentModel: Int): Double = { val modelsComplete = _evolutionStrategy match { case "batch" => if (currentGen == 1) { currentModel } else { _firstGenerationGenePool + (_numberOfMutationsPerGeneration * (currentGen - 2) + currentModel) } case _ => currentGen + _firstGenerationGenePool } (modelsComplete.toDouble / totalModels.toDouble) * 100 } /** * Method for validating the distinct class count for a classification type model (for use in determining which * evaluator to employ for scoring and optimization of each model) * @param df source Dataframe (prior to splitting for train/test) * @return Boolean true for Binary Classification problem, false for multi-class problem * @since 0.4.0 * @author Ben Wilson */ def classificationAdjudicator(df: DataFrame): Boolean = { // Calculate the distinct entries of the label value for a classification problem val uniqueLabelCounts = df.select(_labelCol).distinct().count() if (uniqueLabelCounts <= 2) true else false } /** * Method for restricting the available metrics used or are available for optimizing for classification problems * @param binaryValidation boolean check from classificationAdjudicator() method * @param metricPayload the hard-coded allowable List[String] of allowable classification metrics * from com.databricks.labs.automl.params.EvolutionDefaults * @return a copy of the the allowable params list with the Binary metrics removed if this is a multiclass problem. 
* @since 0.4.0 * @author Ben Wilson */ def classificationMetricValidator( binaryValidation: Boolean, metricPayload: List[String] ): List[String] = { if (binaryValidation) { metricPayload } else { metricPayload.diff(List("areaUnderROC", "areaUnderPR")) } } /** * Method for scoring and evaluating classification models (supporting both multi-class and binary classification * problems) * @param metricName the metric to be tested against (both for binary and multi-class) * @param labelColumn the column name in the data set that is the 'source of truth' to compare against * @param data the DataFrame that has been transformed * @return the score, as a Double value. * @since 0.4.0 * @author Ben Wilson */ def classificationScoring(metricName: String, labelColumn: String, data: DataFrame): Double = { metricName match { case "areaUnderPR" | "areaUnderROC" => new BinaryClassificationEvaluator() .setLabelCol(labelColumn) .setRawPredictionCol("probability") .setMetricName(metricName) .evaluate(data) case _ => new MulticlassClassificationEvaluator() .setLabelCol(labelColumn) .setPredictionCol("prediction") .setMetricName(metricName) .evaluate(data) } } /** * Method for scoring Regression models. * @param metricName The metric desired to be tested * @param labelColumn The name of the label column * @param data the DataFrame that has been transformed by a model. * @return the score for the metric, as a Double value. * @since 0.4.0 * @author Ben Wilson */ def regressionScoring(metricName: String, labelColumn: String, data: DataFrame): Double = { new RegressionEvaluator() .setLabelCol(labelColumn) .setMetricName(metricName) .evaluate(data) } def generateAggressiveness(totalConfigs: Int, currentIteration: Int): Int = { val mutationAggressiveness = _generationalMutationStrategy match { case "linear" => if (totalConfigs - (currentIteration + 1) < 1) 1 else totalConfigs - (currentIteration + 1) case _ => _fixedMutationValue } mutationAggressiveness } }




© 2015 - 2025 Weber Informatics LLC | Privacy Policy