// ai.h2o.sparkling.ml.params.H2OPCAParams.scala Maven / Gradle / Ivy

// Go to download
// Show more of this group Show more artifacts with this name
// Show all versions of sparkling-water-ml_2.11 Show documentation
// Sparkling Water ML Pipelines
// The newest version!
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package ai.h2o.sparkling.ml.params

import hex.pca.PCAModel.PCAParameters
import ai.h2o.sparkling.H2OFrame
import hex.DataInfo.TransformType
import hex.pca.PCAModel.PCAParameters.Method
import hex.pca.PCAImplementation

/**
  * Parameter surface of the H2O PCA algorithm (`hex.pca.PCAModel.PCAParameters`).
  *
  * The trait declares one param per PCA setting, registers a default for each of them,
  * exposes a getter/setter pair per param and converts the current values into maps keyed
  * by the H2O parameter names (see `getH2OPCAParams` and `getSWtoH2OParamNameMap`).
  */
trait H2OPCAParams extends H2OAlgoParamsBase with HasIgnoredCols {

  /** Class tag of the H2O parameter class these params correspond to. */
  protected def paramTag = reflect.classTag[PCAParameters]

  // ---------------------------------------------------------------------------------------
  // Param declarations
  // ---------------------------------------------------------------------------------------

  protected val transform = stringParam(
    name = "transform",
    doc = """Transformation of training data. Possible values are ``"NONE"``, ``"STANDARDIZE"``, ``"NORMALIZE"``, ``"DEMEAN"``, ``"DESCALE"``.""")

  protected val pcaMethod = stringParam(
    name = "pcaMethod",
    doc = """Specify the algorithm to use for computing the principal components: GramSVD - uses a distributed computation of the Gram matrix, followed by a local SVD; Power - computes the SVD using the power iteration method (experimental); Randomized - uses randomized subspace iteration method; GLRM - fits a generalized low-rank model with L2 loss function and no regularization and solves for the SVD using local matrix algebra (experimental). Possible values are ``"GramSVD"``, ``"Power"``, ``"Randomized"``, ``"GLRM"``.""")

  protected val pcaImpl = stringParam(
    name = "pcaImpl",
    doc = """Specify the implementation to use for computing PCA (via SVD or EVD): MTJ_EVD_DENSEMATRIX - eigenvalue decompositions for dense matrix using MTJ; MTJ_EVD_SYMMMATRIX - eigenvalue decompositions for symmetric matrix using MTJ; MTJ_SVD_DENSEMATRIX - singular-value decompositions for dense matrix using MTJ; JAMA - eigenvalue decompositions for dense matrix using JAMA. References: JAMA - http://math.nist.gov/javanumerics/jama/; MTJ - https://github.com/fommil/matrix-toolkits-java/. Possible values are ``"MTJ_EVD_DENSEMATRIX"``, ``"MTJ_EVD_SYMMMATRIX"``, ``"MTJ_SVD_DENSEMATRIX"``, ``"JAMA"``.""")

  protected val k = intParam(name = "k", doc = """Rank of matrix approximation.""")

  protected val maxIterations = intParam(name = "maxIterations", doc = """Maximum training iterations.""")

  protected val seed = longParam(name = "seed", doc = """RNG seed for initialization.""")

  protected val useAllFactorLevels = booleanParam(
    name = "useAllFactorLevels",
    doc = """Whether first factor level is included in each categorical expansion.""")

  protected val computeMetrics = booleanParam(
    name = "computeMetrics",
    doc = """Whether to compute metrics on the training data.""")

  protected val imputeMissing = booleanParam(
    name = "imputeMissing",
    doc = """Whether to impute missing entries with the column mean.""")

  protected val modelId = nullableStringParam(
    name = "modelId",
    doc = """Destination id for this model; auto-generated if not specified.""")

  protected val ignoreConstCols = booleanParam(name = "ignoreConstCols", doc = """Ignore constant columns.""")

  protected val scoreEachIteration = booleanParam(
    name = "scoreEachIteration",
    doc = """Whether to score during each iteration of model training.""")

  protected val maxRuntimeSecs = doubleParam(
    name = "maxRuntimeSecs",
    doc = """Maximum allowed runtime in seconds for model training. Use 0 to disable.""")

  protected val exportCheckpointsDir = nullableStringParam(
    name = "exportCheckpointsDir",
    doc = """Automatically export generated models to this directory.""")

  // ---------------------------------------------------------------------------------------
  // Defaults (enum-backed params store the enum constant's name as a String)
  // ---------------------------------------------------------------------------------------

  setDefault(
    transform -> TransformType.NONE.name(),
    pcaMethod -> Method.GramSVD.name(),
    pcaImpl -> PCAImplementation.MTJ_EVD_SYMMMATRIX.name(),
    k -> 1,
    maxIterations -> 1000,
    seed -> -1L,
    useAllFactorLevels -> false,
    computeMetrics -> true,
    imputeMissing -> false,
    modelId -> null, // declared via nullableStringParam, so null is the "not set" default
    ignoreConstCols -> true,
    scoreEachIteration -> false,
    maxRuntimeSecs -> 0.0,
    exportCheckpointsDir -> null) // declared via nullableStringParam as well

  // ---------------------------------------------------------------------------------------
  // Getters: each one returns the current value of the param of the same name
  // ---------------------------------------------------------------------------------------

  def getTransform(): String = $(transform)

  def getPcaMethod(): String = $(pcaMethod)

  def getPcaImpl(): String = $(pcaImpl)

  def getK(): Int = $(k)

  def getMaxIterations(): Int = $(maxIterations)

  def getSeed(): Long = $(seed)

  def getUseAllFactorLevels(): Boolean = $(useAllFactorLevels)

  def getComputeMetrics(): Boolean = $(computeMetrics)

  def getImputeMissing(): Boolean = $(imputeMissing)

  def getModelId(): String = $(modelId)

  def getIgnoreConstCols(): Boolean = $(ignoreConstCols)

  def getScoreEachIteration(): Boolean = $(scoreEachIteration)

  def getMaxRuntimeSecs(): Double = $(maxRuntimeSecs)

  def getExportCheckpointsDir(): String = $(exportCheckpointsDir)

  // ---------------------------------------------------------------------------------------
  // Setters: each one stores the value and returns this instance for call chaining
  // ---------------------------------------------------------------------------------------

  /** Stores `value` after passing it through `EnumParamValidator` for `TransformType`. */
  def setTransform(value: String): this.type =
    set(transform, EnumParamValidator.getValidatedEnumValue[TransformType](value))

  /** Stores `value` after passing it through `EnumParamValidator` for `Method`. */
  def setPcaMethod(value: String): this.type =
    set(pcaMethod, EnumParamValidator.getValidatedEnumValue[Method](value))

  /** Stores `value` after passing it through `EnumParamValidator` for `PCAImplementation`. */
  def setPcaImpl(value: String): this.type =
    set(pcaImpl, EnumParamValidator.getValidatedEnumValue[PCAImplementation](value))

  def setK(value: Int): this.type = set(k, value)

  def setMaxIterations(value: Int): this.type = set(maxIterations, value)

  def setSeed(value: Long): this.type = set(seed, value)

  def setUseAllFactorLevels(value: Boolean): this.type = set(useAllFactorLevels, value)

  def setComputeMetrics(value: Boolean): this.type = set(computeMetrics, value)

  def setImputeMissing(value: Boolean): this.type = set(imputeMissing, value)

  def setModelId(value: String): this.type = set(modelId, value)

  def setIgnoreConstCols(value: Boolean): this.type = set(ignoreConstCols, value)

  def setScoreEachIteration(value: Boolean): this.type = set(scoreEachIteration, value)

  def setMaxRuntimeSecs(value: Double): this.type = set(maxRuntimeSecs, value)

  def setExportCheckpointsDir(value: String): this.type = set(exportCheckpointsDir, value)

  // ---------------------------------------------------------------------------------------
  // Conversion to H2O parameter maps
  // ---------------------------------------------------------------------------------------

  /**
    * Extends the parameter map produced by the parent trait with the PCA entries.
    *
    * @param trainingFrame frame handed on to the parent implementation and to `getH2OPCAParams`
    * @return the parent's entries followed by the PCA ones (`++`, so PCA entries win on a key clash)
    */
  override private[sparkling] def getH2OAlgorithmParams(trainingFrame: H2OFrame): Map[String, Any] = {
    val inherited = super.getH2OAlgorithmParams(trainingFrame)
    val pcaSpecific = getH2OPCAParams(trainingFrame)
    inherited ++ pcaSpecific
  }

  /**
    * Collects the current PCA param values under their H2O (snake_case) names and combines
    * them with the ignored-columns entry.
    *
    * @param trainingFrame frame passed to `getIgnoredColsParam`
    * @return map from H2O parameter name to the current value
    */
  private[sparkling] def getH2OPCAParams(trainingFrame: H2OFrame): Map[String, Any] = {
    val currentValues = Map(
      "transform" -> getTransform(),
      "pca_method" -> getPcaMethod(),
      "pca_impl" -> getPcaImpl(),
      "k" -> getK(),
      "max_iterations" -> getMaxIterations(),
      "seed" -> getSeed(),
      "use_all_factor_levels" -> getUseAllFactorLevels(),
      "compute_metrics" -> getComputeMetrics(),
      "impute_missing" -> getImputeMissing(),
      "model_id" -> getModelId(),
      "ignore_const_cols" -> getIgnoreConstCols(),
      "score_each_iteration" -> getScoreEachIteration(),
      "max_runtime_secs" -> getMaxRuntimeSecs(),
      "export_checkpoints_dir" -> getExportCheckpointsDir())

    // NOTE(review): `+++` is not defined in this file; presumably a map-merge operator
    // supplied by a parent trait - confirm its duplicate-key semantics there.
    currentValues +++ getIgnoredColsParam(trainingFrame)
  }

  /**
    * Extends the parent's name mapping with the PCA params.
    *
    * @return map from the camelCase param name used here to the snake_case H2O name
    */
  override private[sparkling] def getSWtoH2OParamNameMap(): Map[String, String] = {
    val pcaNames = Map(
      "transform" -> "transform",
      "pcaMethod" -> "pca_method",
      "pcaImpl" -> "pca_impl",
      "k" -> "k",
      "maxIterations" -> "max_iterations",
      "seed" -> "seed",
      "useAllFactorLevels" -> "use_all_factor_levels",
      "computeMetrics" -> "compute_metrics",
      "imputeMissing" -> "impute_missing",
      "modelId" -> "model_id",
      "ignoreConstCols" -> "ignore_const_cols",
      "scoreEachIteration" -> "score_each_iteration",
      "maxRuntimeSecs" -> "max_runtime_secs",
      "exportCheckpointsDir" -> "export_checkpoints_dir")

    super.getSWtoH2OParamNameMap() ++ pcaNames
  }
}