All downloads are free. The search and download features use the official Maven repository.

com.databricks.labs.automl.executor.AutomationConfig.scala Maven / Gradle / Ivy

package com.databricks.labs.automl.executor

import com.databricks.labs.automl.params._
import com.databricks.labs.automl.sanitize.SanitizerDefaults

trait AutomationConfig extends Defaults with SanitizerDefaults {

  // Modeling family, label/feature column names, and the on/off switches and
  // parallelism for data preparation. Each starts at its `_default*` value,
  // which is declared outside this part of the file.
  var _modelingFamily: String = _defaultModelingFamily

  var _labelCol: String = _defaultLabelCol

  var _featuresCol: String = _defaultFeaturesCol

  var _naFillFlag: Boolean = _defaultNAFillFlag

  var _varianceFilterFlag: Boolean = _defaultVarianceFilterFlag

  var _outlierFilterFlag: Boolean = _defaultOutlierFilterFlag

  var _pearsonFilterFlag: Boolean = _defaultPearsonFilterFlag

  var _covarianceFilterFlag: Boolean = _defaultCovarianceFilterFlag

  var _oneHotEncodeFlag: Boolean = _defaultOneHotEncodeFlag

  var _scalingFlag: Boolean = _defaultScalingFlag

  var _featureInteractionFlag: Boolean = _defaultFeatureInteractionFlag

  var _dataPrepCachingFlag: Boolean = _defaultDataPrepCachingFlag

  var _dataPrepParallelism: Int = _defaultDataPrepParallelism

  // Numeric and string boundary maps. Both start at the `_rf*` defaults;
  // setModelingFamily replaces them with the defaults of the chosen family.
  var _numericBoundaries: Map[String, (Double, Double)] =
    _rfDefaultNumBoundaries

  var _stringBoundaries: Map[String, List[String]] = _rfDefaultStringBoundaries

  // Scoring metric and optimization direction, initialized from the
  // classifier defaults.
  var _scoringMetric: String = _scoringDefaultClassifier

  var _scoringOptimizationStrategy: String =
    _scoringOptimizationStrategyClassifier

  // Fill statistics are read from `_fillConfigDefaults`, as are the other
  // NA-fill fields declared further down.
  var _numericFillStat: String = _fillConfigDefaults.numericFillStat

  var _characterFillStat: String = _fillConfigDefaults.characterFillStat

  var _dateTimeConversionType: String = _defaultDateTimeConversionType

  var _fieldsToIgnoreInVector: Array[String] = _defaultFieldsToIgnoreInVector

  // Individual NA-fill and cardinality settings, each copied from the matching
  // member of `_fillConfigDefaults`. setFillConfig() reassembles them into
  // `_fillConfig` whenever one of the corresponding setters is called.
  var _naFillFilterPrecision: Double = _fillConfigDefaults.filterPrecision

  var _categoricalNAFillMap: Map[String, String] =
    _fillConfigDefaults.categoricalNAFillMap

  var _numericNAFillMap: Map[String, AnyVal] =
    _fillConfigDefaults.numericNAFillMap

  var _characterNABlanketFillValue: String =
    _fillConfigDefaults.characterNABlanketFillValue

  var _numericNABlanketFillValue: Double =
    _fillConfigDefaults.numericNABlanketFillValue

  var _naFillMode: String = _fillConfigDefaults.naFillMode

  var _cardinalitySwitchFlag: Boolean = _fillConfigDefaults.cardinalitySwitch

  var _cardinalityType: String = _fillConfigDefaults.cardinalityType

  var _cardinalityLimit: Int = _fillConfigDefaults.cardinalityLimit

  var _cardinalityPrecision: Double = _fillConfigDefaults.cardinalityPrecision

  var _cardinalityCheckMode: String = _fillConfigDefaults.cardinalityCheckMode

  var _modelSelectionDistinctThreshold: Int =
    _fillConfigDefaults.modelSelectionDistinctThreshold

  // Aggregate fill configuration; starts as the defaults object itself.
  var _fillConfig: FillConfig = _fillConfigDefaults

  // Outlier-filter settings copied from `_outlierConfigDefaults`;
  // setOutlierConfig() reassembles them into `_outlierConfig`.
  var _filterBounds: String = _outlierConfigDefaults.filterBounds

  var _lowerFilterNTile: Double = _outlierConfigDefaults.lowerFilterNTile

  var _upperFilterNTile: Double = _outlierConfigDefaults.upperFilterNTile

  var _filterPrecision: Double = _outlierConfigDefaults.filterPrecision

  var _continuousDataThreshold: Int =
    _outlierConfigDefaults.continuousDataThreshold

  var _fieldsToIgnore: Array[String] = _outlierConfigDefaults.fieldsToIgnore

  var _outlierConfig: OutlierConfig = _outlierConfigDefaults

  // Pearson-filter settings copied from `_pearsonConfigDefaults`;
  // setPearsonConfig() reassembles them into `_pearsonConfig`.
  var _pearsonFilterStatistic: String = _pearsonConfigDefaults.filterStatistic

  var _pearsonFilterDirection: String = _pearsonConfigDefaults.filterDirection

  var _pearsonFilterManualValue: Double =
    _pearsonConfigDefaults.filterManualValue

  var _pearsonFilterMode: String = _pearsonConfigDefaults.filterMode

  var _pearsonAutoFilterNTile: Double = _pearsonConfigDefaults.autoFilterNTile

  var _pearsonConfig: PearsonConfig = _pearsonConfigDefaults

  // Covariance-filter cutoffs copied from `_covarianceConfigDefaults`;
  // setCovarianceConfig() reassembles them into `_covarianceConfig`.
  var _correlationCutoffLow: Double =
    _covarianceConfigDefaults.correlationCutoffLow

  var _correlationCutoffHigh: Double =
    _covarianceConfigDefaults.correlationCutoffHigh

  var _covarianceConfig: CovarianceConfig = _covarianceConfigDefaults

  // Scaler settings. The individual fields start at the `defaultScaler*` /
  // `defaultPNorm` values, while the aggregate `_scalingConfig` starts at
  // `_scalingConfigDefaults`; setScalerConfig() rebuilds the aggregate from
  // the individual fields.
  var _scalerType: String = defaultScalerType

  var _scalerMin: Double = defaultScalerMin

  var _scalerMax: Double = defaultScalerMax

  var _standardScalerMeanFlag: Boolean = defaultStandardScalerMeanFlag

  var _standardScalerStdDevFlag: Boolean = defaultStandardScalerStdDevFlag

  var _pNorm: Double = defaultPNorm

  var _scalingConfig: ScalingConfig = _scalingConfigDefaults

  // Aggregate feature-interaction configuration; rebuilt by
  // setFeatureInteractionConfig().
  var _featureInteractionConfig: FeatureInteractionConfig =
    _defaultFeatureInteractionConfig

  // Tuner parallelism and train/test split settings, copied from
  // `_geneticTunerDefaults`.
  var _parallelism: Int = _geneticTunerDefaults.parallelism

  var _kFold: Int = _geneticTunerDefaults.kFold

  var _trainPortion: Double = _geneticTunerDefaults.trainPortion

  var _trainSplitMethod: String = _geneticTunerDefaults.trainSplitMethod

  // KSample settings. The aggregate and each individual field are copied from
  // `_geneticTunerDefaults.kSampleConfig`; setKSampleConfig() rebuilds the
  // aggregate from the individual fields.
  var _kSampleConfig: KSampleConfig = _geneticTunerDefaults.kSampleConfig

  var _syntheticCol: String = _geneticTunerDefaults.kSampleConfig.syntheticCol

  var _kGroups: Int = _geneticTunerDefaults.kSampleConfig.kGroups

  var _kMeansMaxIter: Int = _geneticTunerDefaults.kSampleConfig.kMeansMaxIter

  var _kMeansTolerance: Double =
    _geneticTunerDefaults.kSampleConfig.kMeansTolerance

  var _kMeansDistanceMeasurement: String =
    _geneticTunerDefaults.kSampleConfig.kMeansDistanceMeasurement

  var _kMeansSeed: Long = _geneticTunerDefaults.kSampleConfig.kMeansSeed

  var _kMeansPredictionCol: String =
    _geneticTunerDefaults.kSampleConfig.kMeansPredictionCol

  var _lshHashTables: Int = _geneticTunerDefaults.kSampleConfig.lshHashTables

  var _lshSeed: Long = _geneticTunerDefaults.kSampleConfig.lshSeed

  var _lshOutputCol: String = _geneticTunerDefaults.kSampleConfig.lshOutputCol

  var _quorumCount: Int = _geneticTunerDefaults.kSampleConfig.quorumCount

  var _minimumVectorCountToMutate: Int =
    _geneticTunerDefaults.kSampleConfig.minimumVectorCountToMutate

  var _vectorMutationMethod: String =
    _geneticTunerDefaults.kSampleConfig.vectorMutationMethod

  var _mutationMode: String = _geneticTunerDefaults.kSampleConfig.mutationMode

  var _mutationValue: Double = _geneticTunerDefaults.kSampleConfig.mutationValue

  var _labelBalanceMode: String =
    _geneticTunerDefaults.kSampleConfig.labelBalanceMode

  var _cardinalityThreshold: Int =
    _geneticTunerDefaults.kSampleConfig.cardinalityThreshold

  var _numericRatio: Double = _geneticTunerDefaults.kSampleConfig.numericRatio

  var _numericTarget: Int = _geneticTunerDefaults.kSampleConfig.numericTarget

  var _outputDfRepartitionScaleFactor: Int =
    _geneticTunerDefaults.kSampleConfig.outputDfRepartitionScaleFactor

  // Chronological split settings.
  var _trainSplitChronologicalColumn: String =
    _geneticTunerDefaults.trainSplitChronologicalColumn

  var _trainSplitChronologicalRandomPercentage: Double =
    _geneticTunerDefaults.trainSplitChronologicalRandomPercentage

  // When true, setFieldsToIgnoreInVector appends the chronological split
  // column to the ignore list. Starts as false.
  var _trainSplitColumnSet: Boolean = false

  // Genetic tuner settings, copied from `_geneticTunerDefaults` unless noted.
  var _seed: Long = _geneticTunerDefaults.seed

  var _firstGenerationGenePool: Int =
    _geneticTunerDefaults.firstGenerationGenePool

  var _numberOfGenerations: Int = _geneticTunerDefaults.numberOfGenerations

  var _numberOfParentsToRetain: Int =
    _geneticTunerDefaults.numberOfParentsToRetain

  var _numberOfMutationsPerGeneration: Int =
    _geneticTunerDefaults.numberOfMutationsPerGeneration

  var _geneticMixing: Double = _geneticTunerDefaults.geneticMixing

  var _generationalMutationStrategy: String =
    _geneticTunerDefaults.generationalMutationStrategy

  var _fixedMutationValue: Int = _geneticTunerDefaults.fixedMutationValue

  var _mutationMagnitudeMode: String =
    _geneticTunerDefaults.mutationMagnitudeMode

  // Model seed map starts empty, with its "set" status false.
  var _modelSeedMap: Map[String, Any] = Map.empty

  var _modelSeedSetStatus: Boolean = false

  // First-generation settings. The aggregate starts at
  // `_defaultFirstGenerationConfig`; the individual fields are read from
  // `_geneticTunerDefaults.initialGenerationConfig`.
  var _firstGenerationConfig: FirstGenerationConfig =
    _defaultFirstGenerationConfig

  var _firstGenerationPermutationCount: Int =
    _geneticTunerDefaults.initialGenerationConfig.permutationCount

  var _firstGenerationIndexMixingMode: String =
    _geneticTunerDefaults.initialGenerationConfig.indexMixingMode

  var _firstGenerationArraySeed: Long =
    _geneticTunerDefaults.initialGenerationConfig.arraySeed

  // Hyperspace inference settings, from their `_defaultHyperSpace*` values.
  var _hyperSpaceInference: Boolean = _defaultHyperSpaceInference

  var _hyperSpaceInferenceCount: Int = _defaultHyperSpaceInferenceCount

  var _hyperSpaceModelType: String = _defaultHyperSpaceModelType

  var _hyperSpaceModelCount: Int = _defaultHyperSpaceModelCount

  var _firstGenerationMode: String = _defaultInitialGenerationMode

  // Delta cache backing directory and split caching settings.
  var _deltaCacheBackingDirectory: String =
    _geneticTunerDefaults.deltaCacheBackingDirectory

  var _splitCachingStrategy: String = _geneticTunerDefaults.splitCachingStrategy

  var _deltaCacheBackingDirectoryRemovalFlag: Boolean =
    _geneticTunerDefaults.deltaCacheBackingDirectoryRemovalFlag

  // Aggregate genetic configuration; starts as the defaults object itself.
  var _geneticConfig: GeneticConfig = _geneticTunerDefaults

  // Three MainConfig instances, each initialized from its own defaults object.
  var _mainConfig: MainConfig = _mainConfigDefaults

  var _featureImportancesConfig: MainConfig = _featureImportancesDefaults

  var _treeSplitsConfig: MainConfig = _treeSplitDefaults

  // MLflow settings. The aggregate starts at `_mlFlowConfigDefaults`; the two
  // flags use their own `_defaultMlFlow*` values and the remaining fields are
  // copied from `_mlFlowConfigDefaults`.
  var _mlFlowConfig: MLFlowConfig = _mlFlowConfigDefaults

  var _mlFlowLoggingFlag: Boolean = _defaultMlFlowLoggingFlag

  var _mlFlowArtifactsFlag: Boolean = _defaultMlFlowArtifactsFlag

  var _mlFlowTrackingURI: String = _mlFlowConfigDefaults.mlFlowTrackingURI

  var _mlFlowExperimentName: String = _mlFlowConfigDefaults.mlFlowExperimentName

  var _mlFlowAPIToken: String = _mlFlowConfigDefaults.mlFlowAPIToken

  var _mlFlowModelSaveDirectory: String =
    _mlFlowConfigDefaults.mlFlowModelSaveDirectory

  var _mlFlowLoggingMode: String = _mlFlowConfigDefaults.mlFlowLoggingMode

  var _mlFlowBestSuffix: String = _mlFlowConfigDefaults.mlFlowBestSuffix

  var _mlFlowCustomRunTags: Map[String, String] =
    _mlFlowConfigDefaults.mlFlowCustomRunTags

  // Auto-stopping and feature-importance cutoff settings, from their
  // `_default*` values.
  var _autoStoppingFlag: Boolean = _defaultAutoStoppingFlag

  var _autoStoppingScore: Double = _defaultAutoStoppingScore

  var _featureImportanceCutoffType: String = _defaultFeatureImportanceCutoffType

  var _featureImportanceCutoffValue: Double =
    _defaultFeatureImportanceCutoffValue

  // Evolution strategy, MBO and continuous-evolution settings, copied from
  // `_geneticTunerDefaults`.
  var _evolutionStrategy: String = _geneticTunerDefaults.evolutionStrategy

  var _continuousEvolutionImprovementThreshold: Int =
    _geneticTunerDefaults.continuousEvolutionImprovementThreshold

  var _geneticMBORegressorType: String =
    _geneticTunerDefaults.geneticMBORegressorType

  var _geneticMBOCandidateFactor: Int =
    _geneticTunerDefaults.geneticMBOCandidateFactor

  var _continuousEvolutionMaxIterations: Int =
    _geneticTunerDefaults.continuousEvolutionMaxIterations

  var _continuousEvolutionStoppingScore: Double =
    _geneticTunerDefaults.continuousEvolutionStoppingScore

  var _continuousEvolutionParallelism: Int =
    _geneticTunerDefaults.continuousEvolutionParallelism

  var _continuousEvolutionMutationAggressiveness: Int =
    _geneticTunerDefaults.continuousEvolutionMutationAggressiveness

  var _continuousEvolutionGeneticMixing: Double =
    _geneticTunerDefaults.continuousEvolutionGeneticMixing

  var _continuousEvolutionRollingImprovementCount: Int =
    _geneticTunerDefaults.continuousEvolutionRollingImprovementCount

  // Miscellaneous pipeline settings, from their own defaults.
  var _inferenceConfigSaveLocation: String = _inferenceConfigSaveLocationDefault

  var _dataReductionFactor: Double = _defaultDataReductionFactor

  var _pipelineDebugFlag: Boolean = _defaultPipelineDebugFlag

  // Individual feature-interaction settings, copied from
  // `_defaultFeatureInteractionConfig`; setFeatureInteractionConfig()
  // reassembles them into `_featureInteractionConfig`.
  var _featureInteractionRetentionMode: String =
    _defaultFeatureInteractionConfig.retentionMode
  var _featureInteractionContinuousDiscretizerBucketCount: Int =
    _defaultFeatureInteractionConfig.continuousDiscretizerBucketCount
  var _featureInteractionParallelism: Int =
    _defaultFeatureInteractionConfig.parallelism
  var _featureInteractionTargetInteractionPercentage: Double =
    _defaultFeatureInteractionConfig.targetInteractionPercentage

  // Pipeline identifier; changed through setPipelineId.
  var _pipelineId: String = _defaultPipelineId

  /**
    * Sets the pipeline identifier.
    *
    * @note Unlike most setters in this trait, this one does not call setConfigs().
    * @param value String: the new pipeline id
    * @return this, for chaining
    */
  def setPipelineId(value: String): this.type = {
    _pipelineId = value
    this
  }

  /**
    * Private hook called by the public setters after a field has been changed.
    * Delegates to setMainConfig(), which is defined outside this part of the file
    * (presumably it rebuilds `_mainConfig` from the current field values -- confirm).
    *
    * @return whatever setMainConfig() returns
    */
  private def setConfigs(): this.type = {
    setMainConfig()
  }

  /**
    * Selects the modeling family and resets both boundary maps (`_numericBoundaries` and
    * `_stringBoundaries`) to that family's defaults.
    *
    * The family name is validated before any field is modified, so an unsupported name
    * leaves the current configuration untouched.
    *
    * @param value String: one of "RandomForest", "MLPC", "Trees", "GBT", "LinearRegression",
    *              "LogisticRegression", "SVM", "XGBoost", or one of the "gbm*" names listed in
    *              the match below. Matching is case-sensitive.
    * @return this, for chaining
    * @throws IllegalArgumentException if the supplied family is not supported.
    */
  @throws(classOf[IllegalArgumentException])
  def setModelingFamily(value: String): this.type = {
    // Resolve both boundary maps in one match so the two lists of supported
    // families cannot drift apart, and so nothing is mutated on invalid input.
    val familyDefaults
      : (Map[String, (Double, Double)], Map[String, List[String]]) =
      value match {
        case "RandomForest" =>
          (_rfDefaultNumBoundaries, _rfDefaultStringBoundaries)
        case "MLPC" =>
          (_mlpcDefaultNumBoundaries, _mlpcDefaultStringBoundaries)
        case "Trees" =>
          (_treesDefaultNumBoundaries, _treesDefaultStringBoundaries)
        case "GBT" =>
          (_gbtDefaultNumBoundaries, _gbtDefaultStringBoundaries)
        case "LinearRegression" =>
          (
            _linearRegressionDefaultNumBoundaries,
            _linearRegressionDefaultStringBoundaries
          )
        case "LogisticRegression" =>
          (
            _logisticRegressionDefaultNumBoundaries,
            _logisticRegressionDefaultStringBoundaries
          )
        case "SVM" =>
          (_svmDefaultNumBoundaries, _svmDefaultStringBoundaries)
        case "XGBoost" =>
          // XGBoost is given an empty string-boundary map.
          (_xgboostDefaultNumBoundaries, Map.empty[String, List[String]])
        case "gbmBinary" | "gbmMulti" | "gbmMultiOVA" | "gbmHuber" | "gbmFair" |
            "gbmLasso" | "gbmRidge" | "gbmPoisson" | "gbmQuantile" | "gbmMape" |
            "gbmTweedie" | "gbmGamma" =>
          (_lightGBMDefaultNumBoundaries, _lightGBMDefaultStringBoundaries)
        case _ =>
          throw new IllegalArgumentException(
            s"$value is an unsupported Model Type"
          )
      }
    _modelingFamily = value
    _numericBoundaries = familyDefaults._1
    _stringBoundaries = familyDefaults._2
    setConfigs()
    this
  }

  /**
    * Sets the name of the label column and refreshes the configuration.
    *
    * @param value String: label column name
    * @return this, for chaining
    */
  def setLabelCol(value: String): this.type = {
    _labelCol = value
    setConfigs()
    this
  }

  /**
    * Sets the name of the features column and refreshes the configuration.
    *
    * @param value String: features column name
    * @return this, for chaining
    */
  def setFeaturesCol(value: String): this.type = {
    _featuresCol = value
    setConfigs()
    this
  }

  /** Sets the NA fill flag to true and refreshes the configuration. */
  def naFillOn(): this.type = {
    _naFillFlag = true
    setConfigs()
    this
  }

  /** Sets the NA fill flag to false and refreshes the configuration. */
  def naFillOff(): this.type = {
    _naFillFlag = false
    setConfigs()
    this
  }

  /** Sets the variance filter flag to true and refreshes the configuration. */
  def varianceFilterOn(): this.type = {
    _varianceFilterFlag = true
    setConfigs()
    this
  }

  /** Sets the variance filter flag to false and refreshes the configuration. */
  def varianceFilterOff(): this.type = {
    _varianceFilterFlag = false
    setConfigs()
    this
  }

  /** Sets the outlier filter flag to true and refreshes the configuration. */
  def outlierFilterOn(): this.type = {
    _outlierFilterFlag = true
    setConfigs()
    this
  }

  /** Sets the outlier filter flag to false and refreshes the configuration. */
  def outlierFilterOff(): this.type = {
    _outlierFilterFlag = false
    setConfigs()
    this
  }

  /** Sets the Pearson filter flag to true and refreshes the configuration. */
  def pearsonFilterOn(): this.type = {
    _pearsonFilterFlag = true
    setConfigs()
    this
  }

  /** Sets the Pearson filter flag to false and refreshes the configuration. */
  def pearsonFilterOff(): this.type = {
    _pearsonFilterFlag = false
    setConfigs()
    this
  }

  /** Sets the covariance filter flag to true and refreshes the configuration. */
  def covarianceFilterOn(): this.type = {
    _covarianceFilterFlag = true
    setConfigs()
    this
  }

  /** Sets the covariance filter flag to false and refreshes the configuration. */
  def covarianceFilterOff(): this.type = {
    _covarianceFilterFlag = false
    setConfigs()
    this
  }

  /** Sets the one-hot encoding flag to true and refreshes the configuration. */
  def oneHotEncodingOn(): this.type = {
    _oneHotEncodeFlag = true
    setConfigs()
    this
  }

  /** Sets the one-hot encoding flag to false and refreshes the configuration. */
  def oneHotEncodingOff(): this.type = {
    _oneHotEncodeFlag = false
    setConfigs()
    this
  }

  /** Sets the scaling flag to true and refreshes the configuration. */
  def scalingOn(): this.type = {
    _scalingFlag = true
    setConfigs()
    this
  }

  /** Sets the scaling flag to false and refreshes the configuration. */
  def scalingOff(): this.type = {
    _scalingFlag = false
    setConfigs()
    this
  }

  /** Sets the data-prep caching flag to true and refreshes the configuration. */
  def dataPrepCachingOn(): this.type = {
    _dataPrepCachingFlag = true
    setConfigs()
    this
  }

  /** Sets the data-prep caching flag to false and refreshes the configuration. */
  def dataPrepCachingOff(): this.type = {
    _dataPrepCachingFlag = false
    setConfigs()
    this
  }

  /** Sets the feature interaction flag to true and refreshes the configuration. */
  def featureInteractionOn(): this.type = {
    _featureInteractionFlag = true
    setConfigs()
    this
  }

  /** Sets the feature interaction flag to false and refreshes the configuration. */
  def featureInteractionOff(): this.type = {
    _featureInteractionFlag = false
    setConfigs()
    this
  }

  /**
    * Configures how many concurrent threads are allocated to the asynchronous data prep tasks of the
    * feature engineering phase.
    * @param value Int: thread count; must be strictly positive.
    * @note The practical ceiling depends on driver size. Beyond it the asynchronous pool loses efficacy,
    *       and setting the value too high may cause cluster instability.
    * @author Ben Wilson, Databricks
    * @since 0.6.0
    * @throws IllegalArgumentException when the supplied value is zero or negative.
    */
  @throws(classOf[IllegalArgumentException])
  def setDataPrepParallelism(value: Int): this.type = {
    val isPositive = value > 0
    require(isPositive, "DataPrepParallelism must be greater than zero.")
    _dataPrepParallelism = value
    setConfigs()
    this
  }

  /**
    * Replaces the numeric boundary map and refreshes the configuration.
    *
    * @note setModelingFamily overwrites this map with the chosen family's defaults,
    *       so call this setter after setModelingFamily.
    * @param value Map[String, (Double, Double)]: name -> pair of Doubles
    *              (presumably lower and upper bound -- confirm)
    * @return this, for chaining
    */
  def setNumericBoundaries(value: Map[String, (Double, Double)]): this.type = {
    _numericBoundaries = value
    setConfigs()
    this
  }

  /**
    * Replaces the string boundary map and refreshes the configuration.
    *
    * @note setModelingFamily overwrites this map with the chosen family's defaults,
    *       so call this setter after setModelingFamily.
    * @param value Map[String, List[String]]: name -> list of String values
    * @return this, for chaining
    */
  def setStringBoundaries(value: Map[String, List[String]]): this.type = {
    _stringBoundaries = value
    setConfigs()
    this
  }

  /**
    * Sets the scoring metric. The supplied name is matched case-insensitively and stored in its
    * canonical spelling (e.g. "AREAUNDERROC" is stored as "areaUnderROC").
    *
    * @param value String: one of f1, weightedPrecision, weightedRecall, accuracy, areaUnderPR,
    *              areaUnderROC, rmse, mse, r2, mae (any letter case)
    * @return this, for chaining
    * @throws IllegalArgumentException if the metric is not one of the supported names.
    */
  @throws(classOf[IllegalArgumentException])
  def setScoringMetric(value: String): this.type = {
    // Canonical spellings; this one list drives both matching and the error message.
    val supportedMetrics = Seq(
      "f1",
      "weightedPrecision",
      "weightedRecall",
      "accuracy",
      "areaUnderPR",
      "areaUnderROC",
      "rmse",
      "mse",
      "r2",
      "mae"
    )
    // Locale.ROOT keeps the comparison independent of the JVM default locale
    // (e.g. the Turkish dotless-i mapping would otherwise break "weightedPrecision").
    val requested = value.toLowerCase(java.util.Locale.ROOT)
    _scoringMetric = supportedMetrics
      .find(_.toLowerCase(java.util.Locale.ROOT) == requested)
      .getOrElse(
        throw new IllegalArgumentException(
          s"Supplied Scoring Metric '$value' is not supported. " +
            s"Must be one of: ${supportedMetrics.mkString(", ")}."
        )
      )
    setConfigs()
    this
  }

  /**
    * Sets the direction in which the scoring metric is optimized.
    *
    * @param value String: exactly 'minimize' or 'maximize' (case-sensitive)
    * @return this, for chaining
    * @throws IllegalArgumentException if any other value is supplied.
    */
  def setScoringOptimizationStrategy(value: String): this.type = {
    val isAllowed = value match {
      case "minimize" | "maximize" => true
      case _                       => false
    }
    require(
      isAllowed,
      s"$value is not a member of allowed scoring optimizations: " +
        s"'minimize' or 'maximize'"
    )
    _scoringOptimizationStrategy = value
    setConfigs()
    this
  }

  /**
    * Sets the numeric fill statistic, then rebuilds the fill configuration and the
    * overall configuration.
    *
    * @note The value is not validated here.
    * @param value String: name of the statistic
    * @return this, for chaining
    */
  def setNumericFillStat(value: String): this.type = {
    _numericFillStat = value
    setFillConfig()
    setConfigs()
    this
  }

  /**
    * Sets the character fill statistic, then rebuilds the fill configuration and the
    * overall configuration.
    *
    * @note The value is not validated here.
    * @param value String: name of the statistic
    * @return this, for chaining
    */
  def setCharacterFillStat(value: String): this.type = {
    _characterFillStat = value
    setFillConfig()
    setConfigs()
    this
  }

  /**
    * Sets the date/time conversion type and refreshes the configuration.
    *
    * @note The value is not validated here.
    * @param value String: conversion type name
    * @return this, for chaining
    */
  def setDateTimeConversionType(value: String): this.type = {
    _dateTimeConversionType = value
    setConfigs()
    this
  }

  /**
    * Sets the fields that are ignored in the vector.
    *
    * When `_trainSplitColumnSet` is true, `_trainSplitChronologicalColumn` is appended to
    * the supplied array.
    *
    * @param value Array[String]: field names to ignore
    * @return this, for chaining
    */
  def setFieldsToIgnoreInVector(value: Array[String]): this.type = {
    _fieldsToIgnoreInVector = value
    if (_trainSplitColumnSet) {
      _fieldsToIgnoreInVector :+= _trainSplitChronologicalColumn
    }
    setConfigs()
    this
  }

  /**
    * Setter for the filter precision used by the NA fill configuration.
    *
    * @note setting this value to zero (0) for a large regression problem will incur a long processing time and
    *       an expensive shuffle.
    * @param value Double: Precision accuracy for approximate distinct calculation, in the range [0, 1].
    * @return this, for chaining
    * @throws IllegalArgumentException If the value is outside of the allowable range of [0, 1]
    * @since 0.5.2
    * @author Ben Wilson, Databricks
    */
  // `require` throws IllegalArgumentException, so that is the type declared here.
  @throws(classOf[IllegalArgumentException])
  def setNAFillFilterPrecision(value: Double): this.type = {
    require(
      value >= 0,
      s"Filter Precision for NA Fill must be greater than or equal to 0."
    )
    require(
      value <= 1,
      s"Filter Precision for NA Fill must be less than or equal to 1."
    )
    _naFillFilterPrecision = value
    setFillConfig()
    setConfigs()
    this
  }

  /**
    * Setter for providing a map of [Column Name -> String Fill Value] for manual by-column overrides.  Any non-specified
    * fields in this map will utilize the "auto" statistics-based fill paradigm to calculate and fill any NA values
    * in non-numeric columns.
    *
    * The map is stored as-is (no validation happens in this setter); the fill configuration and the
    * overall configuration are then rebuilt.
    *
    * @note if naFillMode is specified as using Map Fill modes, this setter or the numeric na fill map MUST be set.
    * @note If fields are specified in here that are not part of the DataFrame's schema, an exception will be thrown.
    *       (That check is not performed by this setter.)
    * @param value Map[String, String]: Column Name as String -> Fill Value as String
    * @return this, for chaining
    * @since 0.5.2
    * @author Ben Wilson, Databricks
    */
  def setCategoricalNAFillMap(value: Map[String, String]): this.type = {
    _categoricalNAFillMap = value
    setFillConfig()
    setConfigs()
    this
  }

  /**
    * Setter for providing a map of [Column Name -> AnyVal Fill Value] (must be numeric). Any non-specified
    * fields in this map will utilize the "auto" statistics-based fill paradigm to calculate and fill any NA values
    * in numeric columns.
    *
    * The map is stored as-is (no validation happens in this setter); the fill configuration and the
    * overall configuration are then rebuilt.
    *
    * @note if naFillMode is specified as using Map Fill modes, this setter or the categorical na fill map MUST be set.
    * @note If fields are specified in here that are not part of the DataFrame's schema, an exception will be thrown.
    *       (That check is not performed by this setter.)
    * @param value Map[String, AnyVal]: Column Name as String -> Fill Numeric Type Value
    * @return this, for chaining
    * @since 0.5.2
    * @author Ben Wilson, Databricks
    */
  def setNumericNAFillMap(value: Map[String, AnyVal]): this.type = {
    _numericNAFillMap = value
    setFillConfig()
    setConfigs()
    this
  }

  /**
    * Setter for providing a 'blanket override' value (fill all found categorical columns' missing values with this
    * specified value).
    *
    * After storing the value, the fill configuration and the overall configuration are rebuilt.
    *
    * @param value String: A value to fill all categorical na values in the DataFrame with.
    * @return this, for chaining
    * @since 0.5.2
    * @author Ben Wilson, Databricks
    */
  def setCharacterNABlanketFillValue(value: String): this.type = {
    _characterNABlanketFillValue = value
    setFillConfig()
    setConfigs()
    this
  }

  /**
    * Setter for providing a 'blanket override'  value (fill all found numeric columns' missing values with this
    * specified value)
    *
    * After storing the value, the fill configuration and the overall configuration are rebuilt.
    *
    * @param value Double: A value to fill all numeric na value in the DataFrame with.
    * @return this, for chaining
    * @since 0.5.2
    * @author Ben Wilson, Databricks
    */
  def setNumericNABlanketFillValue(value: Double): this.type = {
    _numericNABlanketFillValue = value
    setFillConfig()
    setConfigs()
    this
  }

  /**
    * Mode for na fill
* Available modes:
* auto : Stats-based na fill for fields. Usage of .setNumericFillStat and * .setCharacterFillStat will inform the type of statistics that will be used to fill.
* mapFill : Custom by-column overrides to 'blanket fill' na values on a per-column * basis. The categorical (string) fields are set via .setCategoricalNAFillMap while the * numeric fields are set via .setNumericNAFillMap.
* blanketFillAll : Fills all fields based on the values specified by * .setCharacterNABlanketFillValue and .setNumericNABlanketFillValue. All NA's for the * appropriate types will be filled in accordingly throughout all columns.
* blanketFillCharOnly Will use statistics to fill in numeric fields, but will replace * all categorical character fields na values with a blanket fill value.
* blanketFillNumOnly Will use statistics to fill in character fields, but will replace * all numeric fields na values with a blanket value. * * @throws IllegalArgumentException if the mods specified is not supported. * @param value String: Mode for NA Fill * @since 0.5.2 * @author Ben Wilson, Databricks */ @throws(classOf[IllegalArgumentException]) def setNAFillMode(value: String): this.type = { require( _allowableNAFillModes.contains(value), s"NA Fill Mode '$value' is not a supported mode. Must be one of:" + s"${_allowableNAFillModes.mkString(", ")}" ) _naFillMode = value setFillConfig() setConfigs() this } def setModelSelectionDistinctThreshold(value: Int): this.type = { _modelSelectionDistinctThreshold = value setFillConfig() setConfigs() this } def cardinalitySwitchOn(): this.type = { _cardinalitySwitchFlag = true setFillConfig() setConfigs() this } def cardinalitySwitchOff(): this.type = { _cardinalitySwitchFlag = false setFillConfig() setConfigs() this } def setCardinalitySwitch(value: Boolean): this.type = { _cardinalitySwitchFlag = value setFillConfig() setConfigs() this } @throws(classOf[AssertionError]) def setCardinalityType(value: String): this.type = { _cardinalityType = value assert( allowableCardinalilties.contains(value), s"Supplied CardinalityType '$value' is not in: " + s"${allowableCardinalilties.mkString(", ")}" ) setFillConfig() setConfigs() this } @throws(classOf[IllegalArgumentException]) def setCardinalityLimit(value: Int): this.type = { require(value > 0, s"Cardinality limit must be greater than 0") _cardinalityLimit = value setFillConfig() setConfigs() this } @throws(classOf[IllegalArgumentException]) def setCardinalityPrecision(value: Double): this.type = { require(value >= 0.0, s"Precision must be greater than or equal to 0.") require(value <= 1.0, s"Precision must be less than or equal to 1.") _cardinalityPrecision = value setFillConfig() setConfigs() this } @throws(classOf[AssertionError]) def setCardinalityCheckMode(value: String): 
this.type = { assert( allowableCategoricalFilterModes.contains(value), s"Supplied CardinalityCheckMode $value is not in: ${allowableCategoricalFilterModes.mkString(", ")}" ) _cardinalityCheckMode = value setFillConfig() setConfigs() this } private def setFillConfig(): this.type = { _fillConfig = FillConfig( numericFillStat = _numericFillStat, characterFillStat = _characterFillStat, modelSelectionDistinctThreshold = _modelSelectionDistinctThreshold, cardinalitySwitch = _cardinalitySwitchFlag, cardinalityType = _cardinalityType, cardinalityLimit = _cardinalityLimit, cardinalityPrecision = _cardinalityPrecision, cardinalityCheckMode = _cardinalityCheckMode, filterPrecision = _naFillFilterPrecision, categoricalNAFillMap = _categoricalNAFillMap, numericNAFillMap = _numericNAFillMap, characterNABlanketFillValue = _characterNABlanketFillValue, numericNABlanketFillValue = _numericNABlanketFillValue, naFillMode = _naFillMode ) this } def setFilterBounds(value: String): this.type = { _filterBounds = value setOutlierConfig() setConfigs() this } def setLowerFilterNTile(value: Double): this.type = { _lowerFilterNTile = value setOutlierConfig() setConfigs() this } def setUpperFilterNTile(value: Double): this.type = { _upperFilterNTile = value setOutlierConfig() setConfigs() this } def setFilterPrecision(value: Double): this.type = { _filterPrecision = value setOutlierConfig() setConfigs() this } def setContinuousDataThreshold(value: Int): this.type = { _continuousDataThreshold = value setOutlierConfig() setConfigs() this } def setFieldsToIgnore(value: Array[String]): this.type = { _fieldsToIgnore = value setOutlierConfig() setConfigs() this } private def setOutlierConfig(): this.type = { _outlierConfig = OutlierConfig( filterBounds = _filterBounds, lowerFilterNTile = _lowerFilterNTile, upperFilterNTile = _upperFilterNTile, filterPrecision = _filterPrecision, continuousDataThreshold = _continuousDataThreshold, fieldsToIgnore = _fieldsToIgnore ) this } def 
setPearsonFilterStatistic(value: String): this.type = { _pearsonFilterStatistic = value setPearsonConfig() setConfigs() this } def setPearsonFilterDirection(value: String): this.type = { _pearsonFilterDirection = value setPearsonConfig() setConfigs() this } def setPearsonFilterManualValue(value: Double): this.type = { _pearsonFilterManualValue = value setPearsonConfig() setConfigs() this } def setPearsonFilterMode(value: String): this.type = { _pearsonFilterMode = value setPearsonConfig() setConfigs() this } def setPearsonAutoFilterNTile(value: Double): this.type = { _pearsonAutoFilterNTile = value setPearsonConfig() setConfigs() this } private def setPearsonConfig(): this.type = { _pearsonConfig = PearsonConfig( filterStatistic = _pearsonFilterStatistic, filterDirection = _pearsonFilterDirection, filterManualValue = _pearsonFilterManualValue, filterMode = _pearsonFilterMode, autoFilterNTile = _pearsonAutoFilterNTile ) this } def setCorrelationCutoffLow(value: Double): this.type = { _correlationCutoffLow = value setCovarianceConfig() setConfigs() this } def setCorrelationCutoffHigh(value: Double): this.type = { _correlationCutoffHigh = value setCovarianceConfig() setConfigs() this } private def setCovarianceConfig(): this.type = { _covarianceConfig = CovarianceConfig( correlationCutoffLow = _correlationCutoffLow, correlationCutoffHigh = _correlationCutoffHigh ) this } def setScalerType(value: String): this.type = { _scalerType = value setScalerConfig() setConfigs() this } def setScalerMin(value: Double): this.type = { _scalerMin = value setScalerConfig() setConfigs() this } def setScalerMax(value: Double): this.type = { _scalerMax = value setScalerConfig() setConfigs() this } def setStandardScalerMeanFlagOn(): this.type = { _standardScalerMeanFlag = true setScalerConfig() setConfigs() this } def setStandardScalerMeanFlagOff(): this.type = { _standardScalerMeanFlag = false setScalerConfig() setConfigs() this } def setStandardScalerStdDevFlagOn(): this.type = { 
_standardScalerStdDevFlag = true setScalerConfig() setConfigs() this } def setStandardScalerStdDevFlagOff(): this.type = { _standardScalerStdDevFlag = false setScalerConfig() setConfigs() this } def setPNorm(value: Double): this.type = { _pNorm = value setScalerConfig() setConfigs() this } private def setScalerConfig(): this.type = { _scalingConfig = ScalingConfig( scalerType = _scalerType, scalerMin = _scalerMin, scalerMax = _scalerMax, standardScalerMeanFlag = _standardScalerMeanFlag, standardScalerStdDevFlag = _standardScalerStdDevFlag, pNorm = _pNorm ) this } /** * Setter for determining the mode of operation for inclusion of interacted features. * Modes are: * - all -> Includes all interactions between all features (after string indexing of categorical values) * - optimistic -> If the Information Gain / Variance, as compared to at least ONE of the parents of the interaction * is above the threshold set by featureInteractionTargetInteractionPercentage * (e.g. if IG of left parent is 0.5 and right parent is 0.9, with threshold set at 10, if the interaction * between these two parents has an IG of 0.42, it would be rejected, but if it was 0.46, it would be kept) * - strict -> the threshold percentage must be met for BOTH parents. * (in the above example, the IG for the interaction would have to be > 0.81 in order to be included in * the feature vector). * @param value String -> one of: 'all', 'optimistic', or 'strict' * @throws IllegalArgumentException if the specified value submitted is not permitted * @since 0.6.2 * @author Ben Wilson, Databricks */ @throws(classOf[IllegalArgumentException]) def setFeatureInteractionRetentionMode(value: String): this.type = { require( allowableFeatureInteractionModes.contains(value), s"FeatureInteractionRetentionMode is invalid. 
Must be one of: ${allowableFeatureInteractionModes .mkString(", ")}" ) _featureInteractionRetentionMode = value setFeatureInteractionConfig() setConfigs() this } /** * Setter for determining the behavior of continuous feature columns. In order to calculate Entropy for a continuous * variable, the distribution must be converted to nominal values for estimation of per-split information gain. * This setting defines how many nominal categorical values to create out of a continuously distributed feature * in order to calculate Entropy. * @param value Int -> must be greater than 1 * @throws IllegalArgumentException if the value specified is <= 1 * @since 0.6.2 * @author Ben Wilson, Databricks */ def setFeatureInteractionContinuousDiscretizerBucketCount( value: Int ): this.type = { require( value > 1, s"FeatureInteractionContinuousDiscretizerBucketCount must be greater than 1." ) _featureInteractionContinuousDiscretizerBucketCount = value setFeatureInteractionConfig() setConfigs() this } /** * Setter for configuring the concurrent count for scoring of feature interaction candidates. * Due to the nature of these operations, the configuration here may need to be set differently to that of * the modeling and general feature engineering phases of the toolkit. This is highly dependent on the row * count of the data set being submitted. * @param value Int -> must be greater than 0 * @since 0.6.2 * @author Ben Wilson, Databricks * @throws IllegalArgumentException if the value is < 1 */ @throws(classOf[IllegalArgumentException]) def setFeatureInteractionParallelism(value: Int): this.type = { require( value >= 1, s"FeatureInteractionParallelism must be set to a value >= 1." ) _featureInteractionParallelism = value setFeatureInteractionConfig() setConfigs() this } /** * Setter for establishing the minimum acceptable InformationGain or Variance allowed for an interaction * candidate based on comparison to the scores of its parents. 
* @param value Double in range of -inf -> inf
    * @since 0.6.2
    * @author Ben Wilson, Databricks
    */
  def setFeatureInteractionTargetInteractionPercentage(
    value: Double
  ): this.type = {
    _featureInteractionTargetInteractionPercentage = value
    setFeatureInteractionConfig()
    setConfigs()
    this
  }

  /**
    * Private setter for establishing the feature interaction configuration
    * @since 0.6.2
    * @author Ben Wilson, Databricks
    */
  private def setFeatureInteractionConfig(): this.type = {
    _featureInteractionConfig = FeatureInteractionConfig(
      retentionMode = _featureInteractionRetentionMode,
      continuousDiscretizerBucketCount =
        _featureInteractionContinuousDiscretizerBucketCount,
      parallelism = _featureInteractionParallelism,
      targetInteractionPercentage =
        _featureInteractionTargetInteractionPercentage
    )
    this
  }

  /**
    * Setter for the parallelism value stored in the genetic configuration.
    *
    * @param value Int -> must not be above 100
    * @throws IllegalArgumentException if the supplied value is above 100
    * @return this
    */
  @throws(classOf[IllegalArgumentException])
  def setParallelism(value: Int): this.type = {
    // Validate the incoming value. Checking the previously stored _parallelism would accept any
    // out-of-range value once and then reject every later call.
    require(
      value <= 100,
      s"Parallelism above 100 will result in cluster instability."
    )
    _parallelism = value
    setGeneticConfig()
    setConfigs()
    this
  }

  /**
    * Stores the kFold value, then rebuilds the genetic configuration and calls setConfigs().
    * @param value Int: the kFold setting
    * @return this
    */
  def setKFold(value: Int): this.type = {
    _kFold = value
    setGeneticConfig()
    setConfigs()
    this
  }

  /**
    * Stores the train portion, then rebuilds the genetic configuration and calls setConfigs().
    * No range validation is performed here.
    * @param value Double: the train portion setting
    * @return this
    */
  def setTrainPortion(value: Double): this.type = {
    _trainPortion = value
    setGeneticConfig()
    setConfigs()
    this
  }

  /**
    * Setter for the train/test split method. Prints a warning when "chronological" is selected.
    * @param value String: must be contained in trainSplitMethods
    * @throws IllegalArgumentException if the value is not one of trainSplitMethods
    * @return this
    */
  @throws(classOf[IllegalArgumentException])
  def setTrainSplitMethod(value: String): this.type = {
    require(
      trainSplitMethods.contains(value),
      s"TrainSplitMethod $value must be one of: ${trainSplitMethods.mkString(", ")}"
    )
    _trainSplitMethod = value
    if (value == "chronological")
      println(
        "[WARNING] setTrainSplitMethod() -> Chronological splits is shuffle-intensive and will increase " +
          "runtime significantly. Only use if necessary for modeling scenario!"
      )
    setGeneticConfig()
    setConfigs()
    this
  }

  /**
    * Rebuilds the stored KSampleConfig from the individual KSample-related fields.
    * @return this
    */
  def setKSampleConfig(): this.type = {
    _kSampleConfig = KSampleConfig(
      syntheticCol = _syntheticCol,
      kGroups = _kGroups,
      kMeansMaxIter = _kMeansMaxIter,
      kMeansTolerance = _kMeansTolerance,
      kMeansDistanceMeasurement = _kMeansDistanceMeasurement,
      kMeansSeed = _kMeansSeed,
      kMeansPredictionCol = _kMeansPredictionCol,
      lshHashTables = _lshHashTables,
      lshSeed = _lshSeed,
      lshOutputCol = _lshOutputCol,
      quorumCount = _quorumCount,
      minimumVectorCountToMutate = _minimumVectorCountToMutate,
      vectorMutationMethod = _vectorMutationMethod,
      mutationMode = _mutationMode,
      mutationValue = _mutationValue,
      labelBalanceMode = _labelBalanceMode,
      cardinalityThreshold = _cardinalityThreshold,
      numericRatio = _numericRatio,
      numericTarget = _numericTarget,
      outputDfRepartitionScaleFactor = _outputDfRepartitionScaleFactor
    )
    this
  }

  /**
    * Setter - for setting the name of the Synthetic column name
    *
    * @param value String: A column name that is uniquely not part of the main DataFrame
    * @since 0.5.1
    * @author Ben Wilson
    */
  def setSyntheticCol(value: String): this.type = {
    _syntheticCol = value
    setKSampleConfig()
    setGeneticConfig()
    setConfigs()
    this
  }

  /**
    * Setter for specifying the number of K-Groups to generate in the KMeans model
    *
    * @param value Int: number of k groups to generate
    * @return this
    */
  def setKGroups(value: Int): this.type = {
    _kGroups = value
    setKSampleConfig()
    setGeneticConfig()
    setConfigs()
    this
  }

  /**
    * Setter for specifying the maximum number of iterations for the KMeans model to go through to converge
    *
    * @param value Int: Maximum limit on iterations
    * @return this
    */
  def setKMeansMaxIter(value: Int): this.type = {
    _kMeansMaxIter = value
    setKSampleConfig()
    setGeneticConfig()
    setConfigs()
    this
  }

  /**
    * Setter for Setting the tolerance for KMeans (must be >0)
    *
    * @param value The tolerance value setting for KMeans
    * @see reference: [[http://spark.apache.org/docs/latest/api/scala/index.html#org.apache.spark.ml.clustering.KMeans]]
    *      for further
details.
    * @return this
    * @throws IllegalArgumentException() if the value entered is not greater than 0
    */
  @throws(classOf[IllegalArgumentException])
  def setKMeansTolerance(value: Double): this.type = {
    require(
      value > 0,
      s"KMeans tolerance value ${value.toString} is out of range. Must be > 0."
    )
    _kMeansTolerance = value
    setKSampleConfig()
    setGeneticConfig()
    setConfigs()
    this
  }

  /**
    * Chooses the distance measurement used to calculate the nearness of vectors to a centroid.
    *
    * @param value String: Options -> "euclidean" or "cosine" Default: "euclidean"
    * @return this
    * @throws IllegalArgumentException() if the value is not in allowableKMeansDistanceMeasurements
    */
  @throws(classOf[IllegalArgumentException])
  def setKMeansDistanceMeasurement(value: String): this.type = {
    require(
      allowableKMeansDistanceMeasurements.contains(value),
      s"Kmeans Distance Measurement $value is not " +
        s"a valid mode of operation. Must be one of: ${allowableKMeansDistanceMeasurements.mkString(", ")}"
    )
    _kMeansDistanceMeasurement = value
    setKSampleConfig()
    setGeneticConfig()
    setConfigs()
    this
  }

  /**
    * Stores the seed for the KMeans clustering algorithm.
    *
    * @param value Long: Seed value
    * @return this
    */
  def setKMeansSeed(value: Long): this.type = {
    _kMeansSeed = value
    setKSampleConfig()
    setGeneticConfig()
    setConfigs()
    this
  }

  /**
    * Names the internal KMeans column used for cluster membership attribution.
    *
    * @param value String: column name for internal algorithm column for group membership
    * @return this
    */
  def setKMeansPredictionCol(value: String): this.type = {
    _kMeansPredictionCol = value
    setKSampleConfig()
    setGeneticConfig()
    setConfigs()
    this
  }

  /**
    * Configures the number of Hash Tables to use for MinHashLSH.
    *
    * @param value Int: Count of hash tables to use
    * @see [[http://spark.apache.org/docs/latest/api/scala/index.html#org.apache.spark.ml.feature.MinHashLSH]]
    *      for more information
    * @return this
    */
  def setLSHHashTables(value: Int): this.type = {
    _lshHashTables = value
    setKSampleConfig()
    setGeneticConfig()
    setConfigs()
    this
  }

  /**
    * Configures the Seed value for the LSH MinHash model.
    *
    * @param value Long: A Seed value
    * @since 0.5.1
    * @author Ben Wilson
    */
  def setLSHSeed(value: Long): this.type = {
    _lshSeed = value
    setKSampleConfig()
    setGeneticConfig()
    setConfigs()
    this
  }

  /**
    * Names the internal LSH output hash information column.
    *
    * @param value String: column name for the internal MinHashLSH Model transformation value
    * @return this
    */
  def setLSHOutputCol(value: String): this.type = {
    _lshOutputCol = value
    setKSampleConfig()
    setGeneticConfig()
    setConfigs()
    this
  }

  /**
    * Sets how many vectors to find in adjacency to the centroid for generation of synthetic data.
    *
    * @note the higher the value set here, the higher the variance in synthetic data generation
    * @param value Int: Number of vectors to find nearest each centroid within the class
    * @return this
    */
  def setQuorumCount(value: Int): this.type = {
    _quorumCount = value
    setKSampleConfig()
    setGeneticConfig()
    setConfigs()
    this
  }

  /**
    * Sets the minimum threshold for vector indexes to mutate within the feature vector.
    *
    * @note In vectorMutationMethod "fixed" this sets the fixed count of how many vector positions to mutate.
    *       In vectorMutationMethod "random" this sets the lower threshold for 'at least this many indexes will
    *       be mutated'
    * @param value The minimum (or fixed) number of indexes to mutate.
    * @return this
    */
  def setMinimumVectorCountToMutate(value: Int): this.type = {
    _minimumVectorCountToMutate = value
    setKSampleConfig()
    setGeneticConfig()
    setConfigs()
    this
  }

  /**
    * Setter for the Vector Mutation Method
    *
    * @note Options:
    *       "fixed" - will use the value of minimumVectorCountToMutate to select random indexes of this number of indexes.
    *       "random" - will use this number as a lower bound on a random selection of indexes between this and the vector length.
    *       "all" - will mutate all of the vectors.
    * @param value String - the mode to use.
* @return this
    * @throws IllegalArgumentException() if the mode is not supported.
    */
  @throws(classOf[IllegalArgumentException])
  def setVectorMutationMethod(value: String): this.type = {
    require(
      allowableVectorMutationMethods.contains(value),
      s"Vector Mutation Mode $value is not supported. " +
        s"Must be one of: ${allowableVectorMutationMethods.mkString(", ")} "
    )
    _vectorMutationMethod = value
    setKSampleConfig()
    setGeneticConfig()
    setConfigs()
    this
  }

  /**
    * Chooses the Mutation Mode applied to the individual values of the feature vector.
    *
    * @note Options:
    *       "weighted" - uses weighted averaging to scale the euclidean distance between the centroid vector and mutation candidate vectors
    *       "random" - randomly selects a position on the euclidean vector between the centroid vector and the candidate mutation vectors
    *       "ratio" - uses a ratio between the values of the centroid vector and the mutation vector
    *
    * @param value String: the mode to use.
    * @return this
    * @throws IllegalArgumentException() if the mode is not supported.
    */
  @throws(classOf[IllegalArgumentException])
  def setMutationMode(value: String): this.type = {
    require(
      allowableMutationModes.contains(value),
      s"Mutation Mode $value is not a valid mode of operation. " +
        s"Must be one of: ${allowableMutationModes.mkString(", ")}"
    )
    _mutationMode = value
    setKSampleConfig()
    setGeneticConfig()
    setConfigs()
    this
  }

  /**
    * Specifies the mutation magnitude for the modes 'weighted' and 'ratio' in mutationMode.
    *
    * @param value Double: value between 0 and 1 (both exclusive) for mutation magnitude adjustment.
    * @note the higher this value, the closer to the centroid vector vs. the candidate mutation vector the synthetic row data will be.
    * @return this
    * @throws IllegalArgumentException() if the value specified is outside of the range (0, 1)
    */
  @throws(classOf[IllegalArgumentException])
  def setMutationValue(value: Double): this.type = {
    require(
      value > 0 && value < 1,
      s"Mutation Value must be between 0 and 1. Value $value is not permitted."
    )
    _mutationValue = value
    setKSampleConfig()
    setGeneticConfig()
    setConfigs()
    this
  }

  /**
    * Setter - for determining the label balance approach mode.
    *
    * @note Available modes:
* 'match': Will match all smaller class counts to largest class count. [WARNING] - May significantly increase memory pressure!
    *       'percentage' Will adjust smaller classes to a percentage value of the largest class count.
    *       'target' Will increase smaller class counts to a fixed numeric target of rows.
    * @param value String: one of: 'match', 'percentage' or 'target'
    * @note Default: "percentage"
    * @since 0.5.1
    * @author Ben Wilson
    * @throws IllegalArgumentException if the provided mode is not supported.
    */
  @throws(classOf[IllegalArgumentException])
  def setLabelBalanceMode(value: String): this.type = {
    // require() raises IllegalArgumentException, so that is the type declared above.
    require(
      allowableLabelBalanceModes.contains(value),
      s"Label Balance Mode $value is not supported. " +
        s"Must be one of: ${allowableLabelBalanceModes.mkString(", ")}"
    )
    _labelBalanceMode = value
    setKSampleConfig()
    setGeneticConfig()
    setConfigs()
    this
  }

  /**
    * Setter - for overriding the cardinality threshold exception threshold. [WARNING] increasing this value on
    * a sufficiently large data set could incur, during runtime, excessive memory and cpu pressure on the cluster.
    *
    * @param value Int: the limit above which an exception will be thrown for a classification problem wherein the
    *              label distinct count is too large to successfully generate synthetic data.
    * @note Default: 20
    * @since 0.5.1
    * @author Ben Wilson
    */
  def setCardinalityThreshold(value: Int): this.type = {
    _cardinalityThreshold = value
    setKSampleConfig()
    setGeneticConfig()
    setConfigs()
    this
  }

  /**
    * Setter - for specifying the percentage ratio for the mode 'percentage' in setLabelBalanceMode()
    *
    * @param value Double: A fractional double, greater than 0.0 and at most 1.0.
    * @note Setting this value to 1.0 is equivalent to setting the label balance mode to 'match'
    * @note Default: 0.2
    * @since 0.5.1
    * @author Ben Wilson
    * @throws IllegalArgumentException if the provided value is outside of the range (0.0, 1.0]
    */
  @throws(classOf[IllegalArgumentException])
  def setNumericRatio(value: Double): this.type = {
    // require() raises IllegalArgumentException, so that is the type declared above.
    require(
      value > 0.0 && value <= 1.0,
      s"Invalid Numeric Ratio entered! Must be between 0 and 1. " +
        s"${value.toString} is not valid."
    )
    _numericRatio = value
    setKSampleConfig()
    setGeneticConfig()
    setConfigs()
    this
  }

  /**
    * Setter - for specifying the target row count to generate for 'target' mode in setLabelBalanceMode()
    *
    * @param value Int: The desired final number of rows per minority class label
    * @note [WARNING] Setting this value to too high of a number will greatly increase runtime and memory pressure.
    * @since 0.5.1
    * @author Ben Wilson
    */
  def setNumericTarget(value: Int): this.type = {
    _numericTarget = value
    setKSampleConfig()
    setGeneticConfig()
    setConfigs()
    this
  }

  /**
    * Stores the chronological split column, appends it to the fields ignored in the feature vector
    * (through setFieldsToIgnoreInVector) and flags that a split column has been set.
    * @param value String: name of the column used for chronological splitting
    * @return this
    */
  def setTrainSplitChronologicalColumn(value: String): this.type = {
    _trainSplitChronologicalColumn = value
    val ignoredFields: Array[String] = _fieldsToIgnoreInVector ++ Array(value)
    setFieldsToIgnoreInVector(ignoredFields)
    _trainSplitColumnSet = true
    setGeneticConfig()
    setConfigs()
    this
  }

  /**
    * Stores the chronological random percentage. Values above 10 are accepted but print a warning.
    * @param value Double: the random percentage setting
    * @return this
    */
  def setTrainSplitChronologicalRandomPercentage(value: Double): this.type = {
    _trainSplitChronologicalRandomPercentage = value
    if (value > 10)
      println(
        "[WARNING] setTrainSplitChronologicalRandomPercentage() setting this value above 10 " +
          "percent will cause significant per-run train/test skew and variability in row counts during training. " +
          "Use higher values only if this is desired."
      )
    setGeneticConfig()
    setConfigs()
    this
  }

  /** Stores the seed, then rebuilds the genetic configuration and calls setConfigs(). */
  def setSeed(value: Long): this.type = {
    _seed = value
    setGeneticConfig()
    setConfigs()
    this
  }

  /** Stores the first generation gene pool size, then rebuilds the genetic configuration. */
  def setFirstGenerationGenePool(value: Int): this.type = {
    _firstGenerationGenePool = value
    setGeneticConfig()
    setConfigs()
    this
  }

  /** Stores the number of generations, then rebuilds the genetic configuration. */
  def setNumberOfGenerations(value: Int): this.type = {
    _numberOfGenerations = value
    setGeneticConfig()
    setConfigs()
    this
  }

  /** Stores the number of parents to retain, then rebuilds the genetic configuration. */
  def setNumberOfParentsToRetain(value: Int): this.type = {
    _numberOfParentsToRetain = value
    setGeneticConfig()
    setConfigs()
    this
  }

  /** Stores the number of mutations per generation, then rebuilds the genetic configuration. */
  def setNumberOfMutationsPerGeneration(value: Int): this.type = {
    _numberOfMutationsPerGeneration = value
    setGeneticConfig()
    setConfigs()
    this
  }

  /** Stores the genetic mixing value (not validated here), then rebuilds the genetic configuration. */
  def setGeneticMixing(value: Double): this.type = {
    _geneticMixing = value
    setGeneticConfig()
    setConfigs()
    this
  }

  /** Stores the generational mutation strategy (not validated here), then rebuilds the genetic configuration. */
  def setGenerationalMutationStrategy(value: String): this.type = {
    _generationalMutationStrategy = value
    setGeneticConfig()
    setConfigs()
    this
  }

  /** Stores the fixed mutation value, then rebuilds the genetic configuration. */
  def setFixedMutationValue(value: Int): this.type = {
    _fixedMutationValue = value
    setGeneticConfig()
    setConfigs()
    this
  }

  /** Stores the mutation magnitude mode (not validated here), then rebuilds the genetic configuration. */
  def setMutationMagnitudeMode(value: String): this.type = {
    _mutationMagnitudeMode = value
    setGeneticConfig()
    setConfigs()
    this
  }

  /**
    * Sets the model seed from a String, converting it with extractGenericModelReturnMap, and
    * marks the model seed as set.
    * @param value String: the representation understood by extractGenericModelReturnMap
    * @return this
    */
  def setModelSeedString(value: String): this.type = {
    _modelSeedMap = extractGenericModelReturnMap(value)
    _modelSeedSetStatus = true
    setGeneticConfig()
    setConfigs()
    this
  }

  /**
    * Sets the model seed directly from a Map and marks the model seed as set.
    * @param value Map[String, Any]: the model seed
    * @return this
    */
  def setModelSeedMap(value: Map[String, Any]): this.type = {
    _modelSeedMap = value
    _modelSeedSetStatus = true
    setGeneticConfig()
    setConfigs()
    this
  }

  /**
    * Private helper that rebuilds the FirstGenerationConfig from its individual fields and then
    * refreshes the genetic configuration and calls setConfigs().
    * @return this
    */
  private def setFirstGenerationConfig(): this.type = {
    _firstGenerationConfig = FirstGenerationConfig(
      permutationCount = _firstGenerationPermutationCount,
      indexMixingMode = _firstGenerationIndexMixingMode,
      arraySeed = _firstGenerationArraySeed
    )
    setGeneticConfig()
    setConfigs()
    this
  }

  /** Stores the first generation permutation count and rebuilds the first generation configuration. */
  def setFirstGenerationPermutationCount(value: Int): this.type = {
    _firstGenerationPermutationCount = value
    setFirstGenerationConfig()
    this
  }

  def setFirstGenerationIndexMixingMode(value: String): this.type = {
    require(
_allowableInitialGenerationIndexMixingModes.contains(value),
      s"Invalid First Generation Index Mixing " +
        s"Mode: $value . First Generation Index Mixing Mode must be one of: " +
        s"${_allowableInitialGenerationIndexMixingModes.mkString(", ")}"
    )
    _firstGenerationIndexMixingMode = value
    setFirstGenerationConfig()
    this
  }

  /** Stores the first generation array seed and rebuilds the first generation configuration. */
  def setFirstGenerationArraySeed(value: Long): this.type = {
    _firstGenerationArraySeed = value
    setFirstGenerationConfig()
    this
  }

  /** Turns the hyper space inference flag on, then rebuilds the genetic configuration. */
  def hyperSpaceInferenceOn(): this.type = {
    _hyperSpaceInference = true
    setGeneticConfig()
    setConfigs()
    this
  }

  /** Turns the hyper space inference flag off, then rebuilds the genetic configuration. */
  def hyperSpaceInferenceOff(): this.type = {
    _hyperSpaceInference = false
    setGeneticConfig()
    setConfigs()
    this
  }

  /**
    * Sets the hyper space inference (permutation) count.
    * Values above 500,000 print a warning; values above 1,000,000 are rejected.
    * @param value Int: the permutation count
    * @throws UnsupportedOperationException if the value is above 1,000,000
    * @return this
    */
  @throws(classOf[UnsupportedOperationException])
  def setHyperSpaceInferenceCount(value: Int): this.type = {
    if (value > 500000)
      println(
        "WARNING! Setting permutation counts above 500,000 will put stress on the driver."
      )
    if (value > 1000000)
      throw new UnsupportedOperationException(
        s"Setting permutation above 1,000,000 is not supported" +
          s" due to runtime considerations.  $value is too large of a value."
      )
    _hyperSpaceInferenceCount = value
    setGeneticConfig()
    setConfigs()
    this
  }

  /**
    * Sets the model type used for post modeling hyper space optimization.
    * @param value String: one of "RandomForest", "LinearRegression" or "XGBoost"
    * @throws IllegalArgumentException if the value is not one of the supported model types
    * @return this
    */
  @throws(classOf[IllegalArgumentException])
  def setHyperSpaceModelType(value: String): this.type = {
    val supportedModelTypes = Array("RandomForest", "LinearRegression", "XGBoost")
    // The message is built from the same list that is validated against, so it cannot drift.
    require(
      supportedModelTypes.contains(value),
      s"Model type $value is not supported for post " +
        s"modeling hyper space optimization! Please choose one of: ${supportedModelTypes.mkString(", ")}"
    )
    _hyperSpaceModelType = value
    setGeneticConfig()
    setConfigs()
    this
  }

  /**
    * Sets the hyper space model count. Values above 50 are accepted but print a warning.
    * @param value Int: the model count
    * @return this
    */
  def setHyperSpaceModelCount(value: Int): this.type = {
    if (value > 50)
      println(
        "WARNING! Setting this value above 50 will incur 50 additional models to be built.  Proceed " +
          "only if this is intended."
      )
    _hyperSpaceModelCount = value
    setGeneticConfig()
    setConfigs()
    this
  }

  /**
    * Sets the first generation mode.
    * @param value String: must be contained in _allowableInitialGenerationModes
    * @throws IllegalArgumentException if the value is not an allowable mode
    * @return this
    */
  @throws(classOf[IllegalArgumentException])
  def setFirstGenerationMode(value: String): this.type = {
    require(
      _allowableInitialGenerationModes.contains(value),
      s"Invalid First Generation Mode: $value . " +
        s"First Generation Mode must be one of : ${_allowableInitialGenerationModes.mkString(", ")}"
    )
    _firstGenerationMode = value
    setGeneticConfig()
    setConfigs()
    this
  }

  /**
    * Replaces the stored MLFlowConfig wholesale.
    * NOTE(review): the individual _mlFlow* fields are not updated here, so a later call to one of the
    * single-field MlFlow setters rebuilds the config from those fields and discards the values supplied
    * in this object. resetMlFlowConfig(config) copies the fields - confirm whether it should be used here.
    * @param value MLFlowConfig: the configuration to store
    * @return this
    */
  def setMlFlowConfig(value: MLFlowConfig): this.type = {
    _mlFlowConfig = value
    setConfigs()
    this
  }

  /** Turns the MlFlow logging flag on. */
  def mlFlowLoggingOn(): this.type = {
    _mlFlowLoggingFlag = true
    setConfigs()
    this
  }

  /** Turns the MlFlow logging flag off. */
  def mlFlowLoggingOff(): this.type = {
    _mlFlowLoggingFlag = false
    setConfigs()
    this
  }

  /** Turns the MlFlow artifact logging flag on. */
  def mlFlowLogArtifactsOn(): this.type = {
    _mlFlowArtifactsFlag = true
    setConfigs()
    this
  }

  /** Turns the MlFlow artifact logging flag off. */
  def mlFlowLogArtifactsOff(): this.type = {
    _mlFlowArtifactsFlag = false
    setConfigs()
    this
  }

  /** Stores the MlFlow tracking URI and rebuilds the MlFlow configuration. */
  def setMlFlowTrackingURI(value: String): this.type = {
    _mlFlowTrackingURI = value
    setMlFlowConfig()
    setConfigs()
    this
  }

  /** Stores the MlFlow experiment name and rebuilds the MlFlow configuration. */
  def setMlFlowExperimentName(value: String): this.type = {
    _mlFlowExperimentName = value
    setMlFlowConfig()
    setConfigs()
    this
  }

  /** Stores the MlFlow API token and rebuilds the MlFlow configuration. */
  def setMlFlowAPIToken(value: String): this.type = {
    _mlFlowAPIToken = value
    setMlFlowConfig()
    setConfigs()
    this
  }

  /**
    * Stores the MlFlow model save directory and rebuilds the MlFlow configuration.
    * @param value String: must start with "dbfs:/"
    * @throws IllegalArgumentException if the value does not start with "dbfs:/"
    * @return this
    */
  @throws(classOf[IllegalArgumentException])
  def setMlFlowModelSaveDirectory(value: String): this.type = {
    require(
      value.take(6) == "dbfs:/",
      s"Model save directory must be written to dbfs:/."
    )
    _mlFlowModelSaveDirectory = value
    setMlFlowConfig()
    setConfigs()
    this
  }

  /**
    * Stores the MlFlow logging mode and rebuilds the MlFlow configuration.
    * @param value String: must be contained in _allowableMlFlowLoggingModes
    * @throws IllegalArgumentException if the value is not an allowable logging mode
    * @return this
    */
  @throws(classOf[IllegalArgumentException])
  def setMlFlowLoggingMode(value: String): this.type = {
    require(
      _allowableMlFlowLoggingModes.contains(value),
      s"MlFlow logging mode $value is not permitted.  Must be " +
        s"one of: ${_allowableMlFlowLoggingModes.mkString(",")}"
    )
    _mlFlowLoggingMode = value
    setMlFlowConfig()
    setConfigs()
    this
  }

  /** Stores the MlFlow best suffix and rebuilds the MlFlow configuration. */
  def setMlFlowBestSuffix(value: String): this.type = {
    _mlFlowBestSuffix = value
    setMlFlowConfig()
    setConfigs()
    this
  }

  /** Stores the custom MlFlow run tags and rebuilds the MlFlow configuration. */
  def setMlFlowCustomRunTags(value: Map[String, String]): this.type = {
    _mlFlowCustomRunTags = value
    setMlFlowConfig()
    setConfigs()
    this
  }

  /**
    * Private helper that rebuilds the stored MLFlowConfig from the individual _mlFlow* fields.
    * @return this
    */
  private def setMlFlowConfig(): this.type = {
    _mlFlowConfig = MLFlowConfig(
      mlFlowTrackingURI = _mlFlowTrackingURI,
      mlFlowExperimentName = _mlFlowExperimentName,
      mlFlowAPIToken = _mlFlowAPIToken,
      mlFlowModelSaveDirectory = _mlFlowModelSaveDirectory,
      mlFlowLoggingMode = _mlFlowLoggingMode,
      mlFlowBestSuffix = _mlFlowBestSuffix,
      mlFlowCustomRunTags = _mlFlowCustomRunTags
    )
    this
  }

  /** Turns the auto stopping flag on. */
  def autoStoppingOn(): this.type = {
    _autoStoppingFlag = true
    setConfigs()
    this
  }

  /** Turns the auto stopping flag off. */
  def autoStoppingOff(): this.type = {
    _autoStoppingFlag = false
    setConfigs()
    this
  }

  /** Stores the auto stopping score. */
  def setAutoStoppingScore(value: Double): this.type = {
    _autoStoppingScore = value
    setConfigs()
    this
  }

  /**
    * Setter for defining the secondary stopping criteria for continuous training mode (number of consistently
    * not-improving runs to terminate the learning algorithm due to diminishing returns).
    * @param value Negative Integer (an improvement to a priori will reset the counter and subsequent non-improvements
    *              will decrement a mutable counter.  If the counter hits this limit specified in value, the continuous
    *              mode algorithm will stop).
    * @author Ben Wilson, Databricks
    * @since 0.6.0
    * @throws IllegalArgumentException if the value is not negative.
    */
  @throws(classOf[IllegalArgumentException])
  def setContinuousEvolutionImprovementThreshold(value: Int): this.type = {
    require(
      value < 0,
      s"ContinuousEvolutionImprovementThreshold must be less than zero. It is " +
        s"recommended to set this value to less than -4."
)
    _continuousEvolutionImprovementThreshold = value
    setGeneticConfig()
    setConfigs()
    this
  }

  /**
    * Setter for selecting the type of Regressor to use for the within-epoch generation MBO of candidates
    * @param value String - one of "XGBoost", "LinearRegression" or "RandomForest"
    * @author Ben Wilson, Databricks
    * @since 0.6.0
    * @throws IllegalArgumentException if the value is not supported
    */
  @throws(classOf[IllegalArgumentException])
  def setGeneticMBORegressorType(value: String): this.type = {
    require(
      allowableMBORegressorTypes.contains(value),
      s"GeneticRegressorType $value is not a supported Regressor " +
        s"Type. Must be one of: ${allowableMBORegressorTypes.mkString(", ")}"
    )
    _geneticMBORegressorType = value
    setGeneticConfig()
    setConfigs()
    this
  }

  /**
    * Setter for defining the factor to be applied to the candidate listing of hyperparameters to generate through
    * mutation for each generation other than the initial and post-modeling optimization phases. The larger this
    * value (default: 10), the more potential space can be searched. There is not a large performance hit to this,
    * and as such, values in excess of 100 are viable.
    * @param value Int - a factor to multiply the numberOfMutationsPerGeneration by to generate a count of potential
    *              candidates.
    * @author Ben Wilson, Databricks
    * @since 0.6.0
    * @throws IllegalArgumentException if the value is not greater than zero.
    */
  @throws(classOf[IllegalArgumentException])
  def setGeneticMBOCandidateFactor(value: Int): this.type = {
    require(value > 0, s"GeneticMBOCandidateFactor must be greater than zero.")
    _geneticMBOCandidateFactor = value
    setGeneticConfig()
    setConfigs()
    this
  }

  /**
    * Sets the feature importance cutoff type.
    * @param value String: must be contained in _supportedFeatureImportanceCutoffTypes
    * @throws IllegalArgumentException if the value is not a supported cutoff type
    * @return this
    */
  @throws(classOf[IllegalArgumentException])
  def setFeatureImportanceCutoffType(value: String): this.type = {
    require(
      _supportedFeatureImportanceCutoffTypes.contains(value),
      s"Feature Importance Cutoff Type '$value' is not supported.  Allowable values: " +
        s"${_supportedFeatureImportanceCutoffTypes.mkString(", ")}"
    )
    _featureImportanceCutoffType = value
    setConfigs()
    this
  }

  /** Stores the feature importance cutoff value (not validated here). */
  def setFeatureImportanceCutoffValue(value: Double): this.type = {
    _featureImportanceCutoffValue = value
    setConfigs()
    this
  }

  /**
    * Sets the evolution strategy.
    * @param value String: must be contained in _allowableEvolutionStrategies
    * @throws IllegalArgumentException if the value is not an allowable strategy
    * @return this
    */
  @throws(classOf[IllegalArgumentException])
  def setEvolutionStrategy(value: String): this.type = {
    require(
      _allowableEvolutionStrategies.contains(value),
      s"Evolution Strategy '$value' is not a supported mode.  Must be one of: ${_allowableEvolutionStrategies
        .mkString(", ")}"
    )
    _evolutionStrategy = value
    setGeneticConfig()
    setConfigs()
    this
  }

  /**
    * Sets the maximum iteration count for continuous evolution. Values above 500 are accepted but
    * print a warning.
    * @param value Int: the maximum number of iterations
    * @return this
    */
  def setContinuousEvolutionMaxIterations(value: Int): this.type = {
    if (value > 500)
      println(
        s"[WARNING] Total Modeling count $value is higher than recommended limit of 500. " +
          s"This tuning will take a long time to run."
      )
    _continuousEvolutionMaxIterations = value
    setGeneticConfig()
    setConfigs()
    this
  }

  /** Stores the continuous evolution stopping score, then rebuilds the genetic configuration. */
  def setContinuousEvolutionStoppingScore(value: Double): this.type = {
    _continuousEvolutionStoppingScore = value
    setGeneticConfig()
    setConfigs()
    this
  }

  /**
    * Sets the continuous evolution parallelism. Values above 10 are accepted but print a warning.
    * @param value Int: the parallelism setting
    * @return this
    */
  def setContinuousEvolutionParallelism(value: Int): this.type = {
    if (value > 10)
      println(
        s"[WARNING] ContinuousEvolutionParallelism -> $value is higher than recommended " +
          s"concurrency for efficient optimization for convergence." +
          s"\n  Setting this value below 11 will converge faster in most cases."
      )
    _continuousEvolutionParallelism = value
    setGeneticConfig()
    setConfigs()
    this
  }

  /**
    * Sets the continuous evolution mutation aggressiveness. Values above 4 are accepted but print
    * a warning.
    * @param value Int: the aggressiveness setting
    * @return this
    */
  def setContinuousEvolutionMutationAggressiveness(value: Int): this.type = {
    if (value > 4)
      println(
        s"[WARNING] ContinuousEvolutionMutationAggressiveness -> $value. " +
          s"\n  Setting this higher than 4 will result in extensive random search and will take longer to converge " +
          s"to optimal hyperparameters."
      )
    _continuousEvolutionMutationAggressiveness = value
    setGeneticConfig()
    setConfigs()
    this
  }

  /**
    * Sets the genetic mixing value used in continuous evolution.
    * @param value Double: must be in the open range (0, 1)
    * @throws IllegalArgumentException if the value is outside of (0, 1)
    * @return this
    */
  @throws(classOf[IllegalArgumentException])
  def setContinuousEvolutionGeneticMixing(value: Double): this.type = {
    // The message names this setting, so a rejected value points at the right setter.
    require(
      value > 0.0 && value < 1.0,
      s"ContinuousEvolutionGeneticMixing must be in range (0,1). Current Setting of $value is not permitted."
    )
    _continuousEvolutionGeneticMixing = value
    setGeneticConfig()
    setConfigs()
    this
  }

  /**
    * Sets the rolling improvement count for continuous evolution. Values below 10 are accepted but
    * print a warning.
    * @param value Int: must be greater than 0
    * @throws IllegalArgumentException if the value is not greater than 0
    * @return this
    */
  @throws(classOf[IllegalArgumentException])
  def setContinuousEvolutionRollingImprovementCount(value: Int): this.type = {
    require(
      value > 0,
      s"ContinuousEvolutionRollingImprovementCount must be > 0. $value is invalid."
    )
    if (value < 10)
      println(
        s"[WARNING] ContinuousEvolutionRollingImprovementCount -> $value setting is low.  " +
          s"Optimal Convergence may not occur due to early stopping."
      )
    _continuousEvolutionRollingImprovementCount = value
    setGeneticConfig()
    setConfigs()
    this
  }

  /**
    * Stores the inference configuration save location.
    * @param value String: must start with "dbfs:/"
    * @throws IllegalArgumentException if the value does not start with "dbfs:/"
    * @return this
    */
  @throws(classOf[IllegalArgumentException])
  def setInferenceConfigSaveLocation(value: String): this.type = {
    require(
      value.take(6) == "dbfs:/",
      s"Inference save location must be on dbfs:/."
    )
    _inferenceConfigSaveLocation = value
    setConfigs()
    this
  }

  /**
    * Stores the data reduction factor.
    * @param value Double: must be in the open range (0, 1)
    * @throws IllegalArgumentException if the value is outside of (0, 1)
    * @return this
    */
  @throws(classOf[IllegalArgumentException])
  def setDataReductionFactor(value: Double): this.type = {
    require(value > 0, s"Data Reduction Factor must be between 0 and 1")
    require(value < 1, s"Data Reduction Factor must be between 0 and 1")
    _dataReductionFactor = value
    setConfigs()
    this
  }

  /**
    * Setter for providing a path to write the kfold train/test splits as Delta data sets to (useful for extremely
    * large data sets or a situation where using local disk storage might be prohibitively expensive)
    * @param value String path to a dbfs location for creating the temporary (or persisted)
    * @since 0.7.1
    * @author Ben Wilson, Databricks
    */
  def setDeltaCacheBackingDirectory(value: String): this.type = {
    if (value != "") {
      require(
        value.take(6) == "dbfs:/",
        s"Delta backing location must be written to dbfs."
)
    }
    _deltaCacheBackingDirectory = value
    setGeneticConfig()
    setConfigs()
    this
  }

  /**
    * Setter for determining the split caching strategy (persist to disk for each kfold split, cache, or
    * backing to Delta). The value is matched case-insensitively and stored in lower case.
    * @param value Configuration string: one of 'persist', 'cache' or 'delta'
    * @throws IllegalArgumentException if the value is not one of the supported strategies
    * @since 0.7.1
    * @author Ben Wilson, Databricks
    */
  @throws(classOf[IllegalArgumentException])
  def setSplitCachingStrategy(value: String): this.type = {
    val valueSet = value.toLowerCase
    // Report the value the caller supplied; an empty interpolation block would print "()".
    require(
      valueSet == "persist" || valueSet == "delta" || valueSet == "cache",
      s"SplitCachingStrategy '$value' is invalid.  Must be either 'delta', 'cache', or 'persist'"
    )
    _splitCachingStrategy = valueSet
    setGeneticConfig()
    setConfigs()
    this
  }

  /**
    * Setter for whether or not to delete the written train/test splits for the run in Delta. Defaulted to true
    * which means that the job will delete the data on Object store to clean itself up after the run is completed
    * if the splitCachingStrategy is set to 'delta'
    * @param value Boolean - true => delete false => leave on Object Store
    * @since 0.7.1
    * @author Ben Wilson, Databricks
    */
  def setDeltaCacheBackingDirectoryRemovalFlag(value: Boolean): this.type = {
    _deltaCacheBackingDirectoryRemovalFlag = value
    setGeneticConfig()
    setConfigs()
    this
  }

  /** Sets the Delta cache backing directory removal flag to true. */
  def deltaCheckBackingDirectoryRemovalOn(): this.type = {
    _deltaCacheBackingDirectoryRemovalFlag = true
    setGeneticConfig()
    setConfigs()
    this
  }

  /** Sets the Delta cache backing directory removal flag to false. */
  def deltaCheckBackingDirectoryRemovalOff(): this.type = {
    _deltaCacheBackingDirectoryRemovalFlag = false
    setGeneticConfig()
    setConfigs()
    this
  }

  /**
    * Private helper that rebuilds the stored GeneticConfig from the individual genetic, KSample,
    * first generation and Delta cache fields.
    * @return this
    */
  private def setGeneticConfig(): this.type = {
    _geneticConfig = GeneticConfig(
      parallelism = _parallelism,
      kFold = _kFold,
      trainPortion = _trainPortion,
      trainSplitMethod = _trainSplitMethod,
      kSampleConfig = _kSampleConfig,
      trainSplitChronologicalColumn = _trainSplitChronologicalColumn,
      trainSplitChronologicalRandomPercentage =
        _trainSplitChronologicalRandomPercentage,
      seed = _seed,
      firstGenerationGenePool = _firstGenerationGenePool,
      numberOfGenerations = _numberOfGenerations,
      numberOfParentsToRetain = _numberOfParentsToRetain,
      numberOfMutationsPerGeneration = _numberOfMutationsPerGeneration,
      geneticMixing = _geneticMixing,
      generationalMutationStrategy = _generationalMutationStrategy,
      fixedMutationValue = _fixedMutationValue,
      mutationMagnitudeMode = _mutationMagnitudeMode,
      evolutionStrategy = _evolutionStrategy,
      geneticMBORegressorType = _geneticMBORegressorType,
      geneticMBOCandidateFactor = _geneticMBOCandidateFactor,
      continuousEvolutionMaxIterations = _continuousEvolutionMaxIterations,
      continuousEvolutionStoppingScore = _continuousEvolutionStoppingScore,
      continuousEvolutionImprovementThreshold =
        _continuousEvolutionImprovementThreshold,
      continuousEvolutionParallelism = _continuousEvolutionParallelism,
      continuousEvolutionMutationAggressiveness =
        _continuousEvolutionMutationAggressiveness,
      continuousEvolutionGeneticMixing = _continuousEvolutionGeneticMixing,
      continuousEvolutionRollingImprovementCount =
        _continuousEvolutionRollingImprovementCount,
      modelSeed = _modelSeedMap,
      hyperSpaceInference = _hyperSpaceInference,
      hyperSpaceInferenceCount = _hyperSpaceInferenceCount,
      hyperSpaceModelType = _hyperSpaceModelType,
      hyperSpaceModelCount = _hyperSpaceModelCount,
      initialGenerationMode = _firstGenerationMode,
      initialGenerationConfig = _firstGenerationConfig,
      deltaCacheBackingDirectory = _deltaCacheBackingDirectory,
      splitCachingStrategy = _splitCachingStrategy,
      deltaCacheBackingDirectoryRemovalFlag =
        _deltaCacheBackingDirectoryRemovalFlag
    )
    this
  }

  /**
    * Rebuilds the stored MainConfig from the individual fields and the nested configuration objects.
    * @return this
    */
  def setMainConfig(): this.type = {
    _mainConfig = MainConfig(
      modelFamily = _modelingFamily,
      labelCol = _labelCol,
      featuresCol = _featuresCol,
      naFillFlag = _naFillFlag,
      varianceFilterFlag = _varianceFilterFlag,
      outlierFilterFlag = _outlierFilterFlag,
      pearsonFilteringFlag = _pearsonFilterFlag,
      covarianceFilteringFlag = _covarianceFilterFlag,
      oneHotEncodeFlag = _oneHotEncodeFlag,
      scalingFlag = _scalingFlag,
      featureInteractionFlag = _featureInteractionFlag,
      dataPrepCachingFlag = _dataPrepCachingFlag,
      dataPrepParallelism = _dataPrepParallelism,
      autoStoppingFlag = _autoStoppingFlag,
      autoStoppingScore = _autoStoppingScore,
      featureImportanceCutoffType = _featureImportanceCutoffType,
      featureImportanceCutoffValue = _featureImportanceCutoffValue,
      dateTimeConversionType = _dateTimeConversionType,
      fieldsToIgnoreInVector = _fieldsToIgnoreInVector,
      numericBoundaries = _numericBoundaries,
      stringBoundaries = _stringBoundaries,
      scoringMetric = _scoringMetric,
      scoringOptimizationStrategy = _scoringOptimizationStrategy,
      fillConfig = _fillConfig,
      outlierConfig = _outlierConfig,
      pearsonConfig = _pearsonConfig,
      covarianceConfig = _covarianceConfig,
      scalingConfig = _scalingConfig,
      featureInteractionConfig = _featureInteractionConfig,
      geneticConfig = _geneticConfig,
      mlFlowLoggingFlag = _mlFlowLoggingFlag,
      mlFlowLogArtifactsFlag = _mlFlowArtifactsFlag,
      mlFlowConfig = _mlFlowConfig,
      inferenceConfigSaveLocation = _inferenceConfigSaveLocation,
      dataReductionFactor = _dataReductionFactor,
      pipelineDebugFlag = _pipelineDebugFlag,
      pipelineId = _pipelineId
    )
    this
  }

  /**
    * Private helper that stores a FillConfig and copies each of its members back onto the
    * corresponding individual fields.
    * @param config FillConfig to store
    * @return this
    */
  private def setFillConfig(config: FillConfig): this.type = {
    _fillConfig = config
    _numericFillStat = config.numericFillStat
    _characterFillStat = config.characterFillStat
    _modelSelectionDistinctThreshold = config.modelSelectionDistinctThreshold
    _cardinalitySwitchFlag = config.cardinalitySwitch
    _cardinalityType = config.cardinalityType
    _cardinalityLimit = config.cardinalityLimit
    _cardinalityPrecision = config.cardinalityPrecision
    _cardinalityCheckMode = config.cardinalityCheckMode
    _naFillFilterPrecision = config.filterPrecision
    _categoricalNAFillMap = config.categoricalNAFillMap
    _numericNAFillMap = config.numericNAFillMap
    _characterNABlanketFillValue = config.characterNABlanketFillValue
    _numericNABlanketFillValue = config.numericNABlanketFillValue
    _naFillMode = config.naFillMode
    this
  }

  private def setOutlierConfig(config: OutlierConfig): this.type = {
    _outlierConfig = config
    _filterBounds = config.filterBounds
    _lowerFilterNTile = config.lowerFilterNTile
_upperFilterNTile = config.upperFilterNTile
    _filterPrecision = config.filterPrecision
    _continuousDataThreshold = config.continuousDataThreshold
    _fieldsToIgnore = config.fieldsToIgnore
    this
  }

  /**
    * Private helper that stores a PearsonConfig and copies each of its members back onto the
    * corresponding individual fields.
    * @param config PearsonConfig to store
    * @return this
    */
  private def setPearsonConfig(config: PearsonConfig): this.type = {
    _pearsonConfig = config
    _pearsonFilterStatistic = config.filterStatistic
    _pearsonFilterDirection = config.filterDirection
    _pearsonFilterManualValue = config.filterManualValue
    _pearsonFilterMode = config.filterMode
    _pearsonAutoFilterNTile = config.autoFilterNTile
    this
  }

  /**
    * Private helper that stores a CovarianceConfig and copies its cutoff values back onto the
    * corresponding individual fields.
    * @param config CovarianceConfig to store
    * @return this
    */
  private def setCovarianceConfig(config: CovarianceConfig): this.type = {
    _covarianceConfig = config
    _correlationCutoffLow = config.correlationCutoffLow
    _correlationCutoffHigh = config.correlationCutoffHigh
    this
  }

  /**
    * Private helper that stores a ScalingConfig and copies each of its members back onto the
    * corresponding individual scaler fields.
    * @param config ScalingConfig to store
    * @return this
    */
  private def setScalerConfig(config: ScalingConfig): this.type = {
    _scalingConfig = config
    _scalerType = config.scalerType
    _scalerMin = config.scalerMin
    _scalerMax = config.scalerMax
    _standardScalerMeanFlag = config.standardScalerMeanFlag
    _standardScalerStdDevFlag = config.standardScalerStdDevFlag
    _pNorm = config.pNorm
    this
  }

  /**
    * Private helper that stores a FeatureInteractionConfig and copies each of its members back onto
    * the corresponding individual fields.
    * @param config FeatureInteractionConfig to store
    * @return this
    */
  private def setFeatureInteractionConfig(
    config: FeatureInteractionConfig
  ): this.type = {
    _featureInteractionConfig = config
    _featureInteractionRetentionMode = config.retentionMode
    _featureInteractionContinuousDiscretizerBucketCount =
      config.continuousDiscretizerBucketCount
    _featureInteractionParallelism = config.parallelism
    _featureInteractionTargetInteractionPercentage =
      config.targetInteractionPercentage
    this
  }

  /**
    * Private helper that stores a KSampleConfig and copies each of its members back onto the
    * corresponding individual fields.
    * @param config KSampleConfig to store
    * @return this
    */
  private def setKSampleConfig(config: KSampleConfig): this.type = {
    _kSampleConfig = config
    _syntheticCol = config.syntheticCol
    _kGroups = config.kGroups
    _kMeansMaxIter = config.kMeansMaxIter
    _kMeansTolerance = config.kMeansTolerance
    _kMeansDistanceMeasurement = config.kMeansDistanceMeasurement
    _kMeansSeed = config.kMeansSeed
    _kMeansPredictionCol = config.kMeansPredictionCol
    _lshHashTables = config.lshHashTables
    _lshSeed = config.lshSeed
    _lshOutputCol = config.lshOutputCol
    _quorumCount = config.quorumCount
    _minimumVectorCountToMutate = config.minimumVectorCountToMutate
    _vectorMutationMethod = config.vectorMutationMethod
    _mutationMode = config.mutationMode
    _mutationValue = config.mutationValue
    _labelBalanceMode = config.labelBalanceMode
    _cardinalityThreshold = config.cardinalityThreshold
    _numericRatio = config.numericRatio
    _numericTarget = config.numericTarget
    _outputDfRepartitionScaleFactor = config.outputDfRepartitionScaleFactor
    this
  }

  /**
    * Private helper that stores a FirstGenerationConfig and copies each of its members back onto
    * the corresponding individual fields.
    * @param config FirstGenerationConfig to store
    * @return this
    */
  private def setFirstGenerationConfig(
    config: FirstGenerationConfig
  ): this.type = {
    _firstGenerationConfig = config
    _firstGenerationPermutationCount = config.permutationCount
    _firstGenerationIndexMixingMode = config.indexMixingMode
    _firstGenerationArraySeed = config.arraySeed
    this
  }

  /**
    * Private helper that stores a GeneticConfig and copies each of its members back onto the
    * corresponding individual fields. The nested KSample and first generation configurations are
    * unpacked through their own config-accepting helpers.
    * @param config GeneticConfig to store
    * @return this
    */
  private def setGeneticConfig(config: GeneticConfig): this.type = {
    _geneticConfig = config
    _parallelism = config.parallelism
    _kFold = config.kFold
    _trainPortion = config.trainPortion
    _trainSplitMethod = config.trainSplitMethod
    // Unpack the nested KSampleConfig onto its individual fields as well.
    setKSampleConfig(config.kSampleConfig)
    _trainSplitChronologicalColumn = config.trainSplitChronologicalColumn
    _trainSplitChronologicalRandomPercentage =
      config.trainSplitChronologicalRandomPercentage
    _seed = config.seed
    _firstGenerationGenePool = config.firstGenerationGenePool
    _numberOfGenerations = config.numberOfGenerations
    _numberOfParentsToRetain = config.numberOfParentsToRetain
    _numberOfMutationsPerGeneration = config.numberOfMutationsPerGeneration
    _geneticMixing = config.geneticMixing
    _generationalMutationStrategy = config.generationalMutationStrategy
    _fixedMutationValue = config.fixedMutationValue
    _mutationMagnitudeMode = config.mutationMagnitudeMode
    _evolutionStrategy = config.evolutionStrategy
    _continuousEvolutionMaxIterations = config.continuousEvolutionMaxIterations
    _continuousEvolutionStoppingScore = config.continuousEvolutionStoppingScore
    _continuousEvolutionParallelism = config.continuousEvolutionParallelism
    _continuousEvolutionMutationAggressiveness =
      config.continuousEvolutionMutationAggressiveness
    _continuousEvolutionGeneticMixing = config.continuousEvolutionGeneticMixing
    _continuousEvolutionRollingImprovementCount =
      config.continuousEvolutionRollingImprovementCount
    _modelSeedMap = config.modelSeed
    _hyperSpaceInference = config.hyperSpaceInference
    _hyperSpaceInferenceCount = config.hyperSpaceInferenceCount
    _hyperSpaceModelType = config.hyperSpaceModelType
    _hyperSpaceModelCount = config.hyperSpaceModelCount
    _firstGenerationMode = config.initialGenerationMode
    _continuousEvolutionImprovementThreshold =
      config.continuousEvolutionImprovementThreshold
    _geneticMBORegressorType = config.geneticMBORegressorType
    _geneticMBOCandidateFactor = config.geneticMBOCandidateFactor
    // Unpack the nested FirstGenerationConfig onto its individual fields as well.
    setFirstGenerationConfig(config.initialGenerationConfig)
    _deltaCacheBackingDirectoryRemovalFlag =
      config.deltaCacheBackingDirectoryRemovalFlag
    _deltaCacheBackingDirectory = config.deltaCacheBackingDirectory
    _splitCachingStrategy = config.splitCachingStrategy
    this
  }

  /**
    * Private helper that stores an MLFlowConfig and copies each of its members back onto the
    * corresponding individual _mlFlow* fields.
    * @param config MLFlowConfig to store
    * @return this
    */
  private def resetMlFlowConfig(config: MLFlowConfig): this.type = {
    _mlFlowConfig = config
    _mlFlowTrackingURI = config.mlFlowTrackingURI
    _mlFlowExperimentName = config.mlFlowExperimentName
    _mlFlowAPIToken = config.mlFlowAPIToken
    _mlFlowModelSaveDirectory = config.mlFlowModelSaveDirectory
    _mlFlowLoggingMode = config.mlFlowLoggingMode
    _mlFlowBestSuffix = config.mlFlowBestSuffix
    _mlFlowCustomRunTags = config.mlFlowCustomRunTags
    this
  }

  def setMainConfig(value: MainConfig): this.type = {
    _mainConfig = value

    /**
      * Reset all of the local var's so that setters can be used in a chained manner without reverting to defaults.
*/ _modelingFamily = value.modelFamily _labelCol = value.labelCol _featuresCol = value.featuresCol _naFillFlag = value.naFillFlag _varianceFilterFlag = value.varianceFilterFlag _outlierFilterFlag = value.outlierFilterFlag _pearsonFilterFlag = value.pearsonFilteringFlag _covarianceFilterFlag = value.covarianceFilteringFlag _oneHotEncodeFlag = value.oneHotEncodeFlag _scalingFlag = value.scalingFlag _featureInteractionFlag = value.featureInteractionFlag _dataPrepCachingFlag = value.dataPrepCachingFlag _dataPrepParallelism = value.dataPrepParallelism _autoStoppingFlag = value.autoStoppingFlag _autoStoppingScore = value.autoStoppingScore _featureImportanceCutoffType = value.featureImportanceCutoffType _featureImportanceCutoffValue = value.featureImportanceCutoffValue _dateTimeConversionType = value.dateTimeConversionType _fieldsToIgnoreInVector = value.fieldsToIgnoreInVector _numericBoundaries = value.numericBoundaries _stringBoundaries = value.stringBoundaries _scoringMetric = value.scoringMetric _scoringOptimizationStrategy = value.scoringOptimizationStrategy setFillConfig(value.fillConfig) setOutlierConfig(value.outlierConfig) setPearsonConfig(value.pearsonConfig) setCovarianceConfig(value.covarianceConfig) setScalerConfig(value.scalingConfig) setFeatureInteractionConfig(value.featureInteractionConfig) setGeneticConfig(value.geneticConfig) _mlFlowLoggingFlag = value.mlFlowLoggingFlag _mlFlowArtifactsFlag = value.mlFlowLogArtifactsFlag resetMlFlowConfig(value.mlFlowConfig) _inferenceConfigSaveLocation = value.inferenceConfigSaveLocation _dataReductionFactor = value.dataReductionFactor _pipelineDebugFlag = value.pipelineDebugFlag _pipelineId = value.pipelineId this } def setFeatConfig(): this.type = { _featureImportancesConfig = MainConfig( modelFamily = "RandomForest", labelCol = _labelCol, featuresCol = _featuresCol, naFillFlag = _naFillFlag, varianceFilterFlag = _varianceFilterFlag, outlierFilterFlag = _outlierFilterFlag, pearsonFilteringFlag = _pearsonFilterFlag, 
covarianceFilteringFlag = _covarianceFilterFlag, oneHotEncodeFlag = _oneHotEncodeFlag, scalingFlag = _scalingFlag, featureInteractionFlag = _featureInteractionFlag, dataPrepCachingFlag = _dataPrepCachingFlag, dataPrepParallelism = _dataPrepParallelism, autoStoppingFlag = _autoStoppingFlag, autoStoppingScore = _autoStoppingScore, featureImportanceCutoffType = _featureImportanceCutoffType, featureImportanceCutoffValue = _featureImportanceCutoffValue, dateTimeConversionType = _dateTimeConversionType, fieldsToIgnoreInVector = _fieldsToIgnoreInVector, numericBoundaries = _numericBoundaries, stringBoundaries = _stringBoundaries, scoringMetric = _scoringMetric, scoringOptimizationStrategy = _scoringOptimizationStrategy, fillConfig = _fillConfig, outlierConfig = _outlierConfig, pearsonConfig = _pearsonConfig, covarianceConfig = _covarianceConfig, scalingConfig = _scalingConfig, featureInteractionConfig = _featureInteractionConfig, geneticConfig = _geneticConfig, mlFlowLoggingFlag = _mlFlowLoggingFlag, mlFlowLogArtifactsFlag = _mlFlowArtifactsFlag, mlFlowConfig = _mlFlowConfig, inferenceConfigSaveLocation = _inferenceConfigSaveLocation, dataReductionFactor = _dataReductionFactor, pipelineDebugFlag = _pipelineDebugFlag, pipelineId = _pipelineId ) this } def setFeatConfig(value: MainConfig): this.type = { _featureImportancesConfig = value require( value.modelFamily == "RandomForest", s"Model Family for Feature Importances must be 'RandomForest'. ${value.modelFamily} is not supported." 
) setConfigs() this } def setTreeSplitsConfig(): this.type = { _treeSplitsConfig = MainConfig( modelFamily = "Trees", labelCol = _labelCol, featuresCol = _featuresCol, naFillFlag = _naFillFlag, varianceFilterFlag = _varianceFilterFlag, outlierFilterFlag = _outlierFilterFlag, pearsonFilteringFlag = _pearsonFilterFlag, covarianceFilteringFlag = _covarianceFilterFlag, oneHotEncodeFlag = _oneHotEncodeFlag, scalingFlag = _scalingFlag, featureInteractionFlag = _featureInteractionFlag, dataPrepCachingFlag = _dataPrepCachingFlag, dataPrepParallelism = _dataPrepParallelism, autoStoppingFlag = _autoStoppingFlag, autoStoppingScore = _autoStoppingScore, featureImportanceCutoffType = _featureImportanceCutoffType, featureImportanceCutoffValue = _featureImportanceCutoffValue, dateTimeConversionType = _dateTimeConversionType, fieldsToIgnoreInVector = _fieldsToIgnoreInVector, numericBoundaries = _numericBoundaries, stringBoundaries = _stringBoundaries, scoringMetric = _scoringMetric, scoringOptimizationStrategy = _scoringOptimizationStrategy, fillConfig = _fillConfig, outlierConfig = _outlierConfig, pearsonConfig = _pearsonConfig, covarianceConfig = _covarianceConfig, scalingConfig = _scalingConfig, featureInteractionConfig = _featureInteractionConfig, geneticConfig = _geneticConfig, mlFlowLoggingFlag = _mlFlowLoggingFlag, mlFlowLogArtifactsFlag = _mlFlowArtifactsFlag, mlFlowConfig = _mlFlowConfig, inferenceConfigSaveLocation = _inferenceConfigSaveLocation, dataReductionFactor = _dataReductionFactor, pipelineDebugFlag = _pipelineDebugFlag, pipelineId = _pipelineId ) this } def setTreeSplitsConfig(value: MainConfig): this.type = { _treeSplitsConfig = value require( value.modelFamily == "Trees", s"Model Family for Trees Splits must be 'Trees'. ${value.modelFamily} is not supported." 
) setConfigs() this } def getPipelineId: String = _mainConfig.pipelineId def getModelingFamily: String = _modelingFamily def getLabelCol: String = _labelCol def getFeaturesCol: String = _featuresCol def getNaFillStatus: Boolean = _naFillFlag def getVarianceFilterStatus: Boolean = _varianceFilterFlag def getOutlierFilterStatus: Boolean = _outlierFilterFlag def getPearsonFilterStatus: Boolean = _pearsonFilterFlag def getCovarianceFilterStatus: Boolean = _covarianceFilterFlag def getOneHotEncodingStatus: Boolean = _oneHotEncodeFlag def getScalingStatus: Boolean = _scalingFlag def getFeatureInteractionStatus: Boolean = _featureInteractionFlag def getDataPrepCachingStatus: Boolean = _dataPrepCachingFlag def getDataPrepParallelism: Int = _dataPrepParallelism def getNumericBoundaries: Map[String, (Double, Double)] = _numericBoundaries def getStringBoundaries: Map[String, List[String]] = _stringBoundaries def getScoringMetric: String = _scoringMetric def getScoringOptimizationStrategy: String = _scoringOptimizationStrategy def getNumericFillStat: String = _numericFillStat def getCharacterFillStat: String = _characterFillStat def getDateTimeConversionType: String = _dateTimeConversionType def getFieldsToIgnoreInVector: Array[String] = _fieldsToIgnoreInVector def getNAFillFilterPrecision: Double = _naFillFilterPrecision def getCategoricalNAFillMap: Map[String, String] = _categoricalNAFillMap def getNumericNAFillMap: Map[String, AnyVal] = _numericNAFillMap def getCharacterNABlanketFillValue: String = _characterNABlanketFillValue def getNumericNABlanketFillValue: Double = _numericNABlanketFillValue def getNAFillMode: String = _naFillMode def getCardinalitySwitch: Boolean = _cardinalitySwitchFlag def getCardinalityType: String = _cardinalityType def getCardinalityLimit: Int = _cardinalityLimit def getCardinalityPrecision: Double = _cardinalityPrecision def getCardinalityCheckMode: String = _cardinalityCheckMode def getModelSelectionDistinctThreshold: Int = 
_modelSelectionDistinctThreshold def getFillConfig: FillConfig = _fillConfig def getFilterBounds: String = _filterBounds def getLowerFilterNTile: Double = _lowerFilterNTile def getUpperFilterNTile: Double = _upperFilterNTile def getFilterPrecision: Double = _filterPrecision def getContinuousDataThreshold: Int = _continuousDataThreshold def getFieldsToIgnore: Array[String] = _fieldsToIgnore def getOutlierConfig: OutlierConfig = _outlierConfig def getPearsonFilterStatistic: String = _pearsonFilterStatistic def getPearsonFilterDirection: String = _pearsonFilterDirection def getPearsonFilterManualValue: Double = _pearsonFilterManualValue def getPearsonFilterMode: String = _pearsonFilterMode def getPearsonAutoFilterNTile: Double = _pearsonAutoFilterNTile def getPearsonConfig: PearsonConfig = _pearsonConfig def getCorrelationCutoffLow: Double = _correlationCutoffLow def getCorrelationCutoffHigh: Double = _correlationCutoffHigh def getCovarianceConfig: CovarianceConfig = _covarianceConfig def getScalerType: String = _scalerType def getScalerMin: Double = _scalerMin def getScalerMax: Double = _scalerMax def getStandardScalingMeanFlag: Boolean = _standardScalerMeanFlag def getStandardScalingStdDevFlag: Boolean = _standardScalerStdDevFlag def getPNorm: Double = _pNorm def getScalingConfig: ScalingConfig = _scalingConfig def getFeatureInteractionConfig: FeatureInteractionConfig = _featureInteractionConfig def getFeatureInteractionRetentionMode: String = _featureInteractionRetentionMode def getFeatureInteractionContinuousDiscretizerBucketCount: Int = _featureInteractionContinuousDiscretizerBucketCount def getFeatureInteractionParallelism: Int = _featureInteractionParallelism def getFeatureInteractionTargetInteractionPercentage: Double = _featureInteractionTargetInteractionPercentage def getParallelism: Int = _parallelism def getKFold: Int = _kFold def getTrainPortion: Double = _trainPortion def getTrainSplitMethod: String = _trainSplitMethod def getKSampleConfig: KSampleConfig 
= _kSampleConfig def getSyntheticCol: String = _syntheticCol def getKGroups: Int = _kGroups def getKMeansMaxIter: Int = _kMeansMaxIter def getKMeansTolerance: Double = _kMeansTolerance def getKMeansDistanceMeasurement: String = _kMeansDistanceMeasurement def getKMeansSeed: Long = _kMeansSeed def getKMeansPredictionCol: String = _kMeansPredictionCol def getLSHHashTables: Int = _lshHashTables def getLSHOutputCol: String = _lshOutputCol def getQuorumCount: Int = _quorumCount def getMinimumVectorCountToMutate: Int = _minimumVectorCountToMutate def getVectorMutationMethod: String = _vectorMutationMethod def getMutationMode: String = _mutationMode def getMutationValue: Double = _mutationValue def getTrainSplitChronologicalColumn: String = _trainSplitChronologicalColumn def getTrainSplitChronologicalRandomPercentage: Double = _trainSplitChronologicalRandomPercentage def getSeed: Long = _seed def getFirstGenerationGenePool: Int = _firstGenerationGenePool def getNumberOfGenerations: Int = _numberOfGenerations def getNumberOfParentsToRetain: Int = _numberOfParentsToRetain def getNumberOfMutationsPerGeneration: Int = _numberOfMutationsPerGeneration def getGeneticMixing: Double = _geneticMixing def getGenerationalMutationStrategy: String = _generationalMutationStrategy def getFixedMutationValue: Int = _fixedMutationValue def getMutationMagnitudeMode: String = _mutationMagnitudeMode def getModelSeedSetStatus: Boolean = _modelSeedSetStatus def getModelSeedMap: Map[String, Any] = _modelSeedMap def getFirstGenerationPermutationCount: Int = _firstGenerationPermutationCount def getFirstGenerationIndexMixingMode: String = _firstGenerationIndexMixingMode def getFirstGenerationArraySeed: Long = _firstGenerationArraySeed def getHyperSpaceInferenceStatus: Boolean = _hyperSpaceInference def getHyperSpaceInferenceCount: Int = _hyperSpaceInferenceCount def getHyperSpaceModelType: String = _hyperSpaceModelType def getHyperSpaceModelCount: Int = _hyperSpaceModelCount def 
getFirstGenerationConfig: FirstGenerationConfig = _firstGenerationConfig def getFirstGenerationMode: String = _firstGenerationMode def getMlFlowLoggingFlag: Boolean = _mlFlowLoggingFlag def getMlFlowLogArtifactsFlag: Boolean = _mlFlowArtifactsFlag def getMlFlowTrackingURI: String = _mlFlowTrackingURI def getMlFlowExperimentName: String = _mlFlowExperimentName def getMlFlowModelSaveDirectory: String = _mlFlowModelSaveDirectory def getMlFlowLoggingMode: String = _mlFlowLoggingMode def getMlFlowBestSuffix: String = _mlFlowBestSuffix def getMlFlowCustomRunTags: Map[String, String] = _mlFlowCustomRunTags def getMlFlowConfig: MLFlowConfig = _mlFlowConfig def getGeneticConfig: GeneticConfig = _geneticConfig def getMainConfig: MainConfig = _mainConfig def getFeatConfig: MainConfig = _featureImportancesConfig def getTreeSplitsConfig: MainConfig = _treeSplitsConfig def getAutoStoppingFlag: Boolean = _autoStoppingFlag def getAutoStoppingScore: Double = _autoStoppingScore def getFeatureImportanceCutoffType: String = _featureImportanceCutoffType def getFeatureImportanceCutoffValue: Double = _featureImportanceCutoffValue def getEvolutionStrategy: String = _evolutionStrategy def getContinuousEvolutionMaxIterations: Int = _continuousEvolutionMaxIterations def getContinuousEvolutionStoppingScore: Double = _continuousEvolutionStoppingScore def getContinuousEvolutionParallelism: Int = _continuousEvolutionParallelism def getContinuousEvolutionMutationAggressiveness: Int = _continuousEvolutionMutationAggressiveness def getContinuousEvolutionGeneticMixing: Double = _continuousEvolutionGeneticMixing def getContinuousEvolutionRollingImporvementCount: Int = _continuousEvolutionRollingImprovementCount def getInferenceConfigSaveLocation: String = _inferenceConfigSaveLocation def getDataReductionFactor: Double = _dataReductionFactor def getDeltaCacheBackingDirectory: String = _deltaCacheBackingDirectory def getDeltaCacheBackingDirectoryRemovalFlag: Boolean = 
_deltaCacheBackingDirectoryRemovalFlag def getSplitCachingStrategy: String = _splitCachingStrategy /** * Helper method for extracting the config from a run's GenericModelReturn payload * This is designed to handle "lazy" copy/paste from either stdout or the mlflow ui. * The alternative (preferred method of seeding a run start) is to submit a Map() for the run configuration seed. * * @param fullModelReturn: String The Generic Model Config of a run, to be used as a starting point for further * tuning or refinement. * @return A Map Object that can be parsed into the requisite case class definition to set a seed for a particular * type of model run. */ private def extractGenericModelReturnMap( fullModelReturn: String ): Map[String, Any] = { val patternToMatch = "(?<=\\()[^()]*".r val configElements = patternToMatch.findAllIn(fullModelReturn).toList(1).split(",") var configMap = Map[String, Any]() configElements.foreach { x => val components = x.trim.split(" -> ") configMap += (components(0) -> components(1)) } configMap } }




© 2015 - 2025 Weber Informatics LLC | Privacy Policy