org.apache.spark.ml.odkl.Scaler.scala
package org.apache.spark.ml.odkl

/**
  * ml.odkl is an extension to the Spark ML package intended to
  * 1. Provide a modular structure with shared and tested common code
  * 2. Add the ability to create train-only transformations (for better prediction performance)
  * 3. Unify extra information generation by the model fitters
  * 4. Support combined models with an option for parallel training.
  *
  * This particular file contains a utility used for linear scaling of features based on mean and variance.
  */

import org.apache.spark.annotation.DeveloperApi
import org.apache.spark.ml.param.shared.HasFeaturesCol
import org.apache.spark.ml.param.{Param, BooleanParam, ParamMap, Params}
import org.apache.spark.ml.util.{DefaultParamsWritable, Identifiable}
import org.apache.spark.ml.Estimator
import org.apache.spark.mllib.feature.{StandardScaler, StandardScalerModel}
import org.apache.spark.mllib.linalg.{DenseVector, SparseVector, BLAS, Vector}
import org.apache.spark.sql.odkl.SparkSqlUtils
import org.apache.spark.sql.types.StructType
import org.apache.spark.sql.{DataFrame, functions}

/**
  * Scaler parameters.
  */
trait ScalerParams extends Params with HasFeaturesCol {

  /**
    * Whether to center the data with the mean before scaling.
    * Centering builds a dense output, so it does not work on sparse input
    * and will raise an exception.
    * Default: true
    *
    * @group param
    */
  val withMean: BooleanParam = new BooleanParam(this, "withMean",
    "Whether to center data with mean")

  /** @group getParam */
  def getWithMean: Boolean = $(withMean)

  def setWithMean(value: Boolean): this.type = set(withMean, value)

  /**
    * Whether to scale the data to unit standard deviation.
    * Default: true
    *
    * @group param
    */
  val withStd: BooleanParam = new BooleanParam(this, "withStd",
    "Whether to scale the data to unit standard deviation")

  /** @group getParam */
  def getWithStd: Boolean = $(withStd)

  def setWithStd(value: Boolean): this.type = set(withStd, value)

  setDefault(withMean -> true, withStd -> true)
}
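/*
 * Illustrative note (not part of the original file): by default both centering and scaling are
 * enabled (withMean = true, withStd = true). A pre-configured ScalerEstimator can be passed to
 * Scaler.scale to override this, e.g. to avoid the dense conversion on sparse features.
 * `MyLinearModel` and `myLinearEstimator` below are assumed placeholder names.
 *
 *   val noMeanScaler = new ScalerEstimator[MyLinearModel]().setWithMean(false)
 *   val stage = Scaler.scale(myLinearEstimator, noMeanScaler)
 */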

/**
  * This is a scaler implementation specific to linear models. It uses the ability to propagate the scaling
  * into the model weights to avoid scaling overhead when predicting.
  */
class ScalerEstimator[M <: ModelWithSummary[M]]
(
  override val uid: String = Identifiable.randomUID("scalerEstimator"))
  extends Estimator[Scaler.Unscaler[M]] with ScalerParams with DefaultParamsWritable {

  private[odkl] val modelTransformer = new Param[(M,StandardScalerModel) => M](
    this, "modelTransformer", "Function used to transform nested model.")

  override def fit(dataset: DataFrame): Scaler.Unscaler[M] = {
    val scaler = new StandardScaler($(withMean), $(withStd))
      .fit(dataset.select($(featuresCol)).map(_.getAs[Vector](0)))

    new Scaler.Unscaler[M](scaler, $(modelTransformer)).setParent(this)
  }

  override def copy(extra: ParamMap) = defaultCopy(extra)

  override def transformSchema(schema: StructType): StructType = schema
}

object Scaler extends Serializable {
  /**
    * Given a linear model estimator, first scales the data based on mean/variance, then trains the model
    * and unscales its weights.
    *
    * @param estimator Nested linear model estimator.
    * @param scaler    Pre-configured scaler (by default with mean and std).
    * @return Linear model as produced by the nested estimator, with unscaled weights.
    */
  def scale[M <: LinearModel[M]]
  (
    estimator: SummarizableEstimator[M],
    scaler: ScalerEstimator[M] = new ScalerEstimator[M]())
  (implicit m: Manifest[M])
  : UnwrappedStage[M, Unscaler[M]] = {

    new UnwrappedStage[M, Unscaler[M]](estimator, scaler.set(scaler.modelTransformer, transformModel[M] _))
  }
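  /*
   * Usage sketch (added for illustration; `myLinearEstimator` and `train` are assumed
   * placeholders). Wrapping a linear estimator with `scale` standardizes the features during
   * training only; the fitted model is returned with weights already mapped back to the
   * original feature scale, so no extra scaling is needed at prediction time.
   *
   *   val stage = Scaler.scale(myLinearEstimator)   // UnwrappedStage[M, Unscaler[M]]
   *   val model = stage.fit(train)                  // coefficients and intercept are unscaled
   */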

  /**
    * Extension for scaling composite models, assuming they are composites of linear models.
    *
    * @param estimator Nested composite estimator.
    * @param scaler    Scaler with the settings.
    * @return Composite model as produced by the nested estimator, with all parts unscaled.
    */
  def scaleComposite[M <: LinearModel[M], C <: CombinedModel[M, C]]
  (
    estimator: SummarizableEstimator[C],
    scaler: ScalerEstimator[C] = new ScalerEstimator[C]()) : UnwrappedStage[C, Unscaler[C]] = {
    new UnwrappedStage[C, Unscaler[C]](estimator, scaler.set(
      scaler.modelTransformer, (model : C, scalerModel: StandardScalerModel) => model.transformNested(x => transformModel[M](x, scalerModel))))
  }
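  /*
   * Composite usage sketch (illustrative; the type and value names are assumed placeholders).
   * Each nested linear model of the combined model is unscaled in the same way via transformNested.
   *
   *   val compositeStage = Scaler.scaleComposite[MyLinearModel, MyCombinedModel](myCombinedEstimator)
   *   val combined = compositeStage.fit(train)
   */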


  private def transformData(dataset: DataFrame, featuresCol: String, scaler: StandardScalerModel): DataFrame = {

    val structField = dataset.schema.fields(dataset.schema.fieldIndex(featuresCol))

    val transform = SparkSqlUtils.reflectionLock.synchronized(
      if (scaler.withMean) {
        // Only dense vector can be scaled with mean.
        functions.udf[Vector, Vector](x => scaler.transform(x.toDense))
      }
      else {
        functions.udf[Vector, Vector](x => scaler.transform(x))
      })

    dataset.withColumn(
      featuresCol, transform(dataset(featuresCol)).as(featuresCol, structField.metadata))
  }

  private def transformModel[M <: LinearModel[M]](model: M, scaler: StandardScalerModel): M = {
    val nestedSummary: ModelSummary = model.summary

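    // Note on the unscaling math (comment added for clarity): when centering was applied the nested
    // model was trained on x_scaled = (x - mean) / std, so its prediction is w . ((x - mean) / std) + b.
    // In terms of the original features this equals (w / std) . x + (b - (w / std) . mean), which yields
    // the unscaled coefficients and intercept computed below. Without centering only the division by
    // std has to be undone and the intercept stays unchanged.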
    val coefficientsAndIntercept = if (scaler.withMean) {
      val noMeanModel = scaler.setWithMean(false)

      val newCoefficients = noMeanModel.transform(model.getCoefficients)
      val interceptDelta = BLAS.dot(newCoefficients, scaler.mean)

      (newCoefficients, model.getIntercept - interceptDelta)

    } else {
      (scaler.transform(model.getCoefficients), model.getIntercept)
    }

    val summary = SparkSqlUtils.reflectionLock.synchronized(
      nestedSummary.transform(model.weights -> (data => {
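        // The UDFs below map a feature index in the weights summary block to its mean, std and
        // unscaled weight; a negative index denotes the intercept, for which mean/std are reported as 0.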
        val mean = functions.udf[Double, Int](i => if (i >= 0) scaler.mean(i) else 0.0)
        val std = functions.udf[Double, Int](i => if (i >= 0) scaler.std(i) else 0.0)
        val weight = functions.udf[Double, Int](i => if (i >= 0) coefficientsAndIntercept._1(i) else coefficientsAndIntercept._2)

        // TODO: Configure column names
        data
          .withColumnRenamed(model.weight, s"unscaled_${model.weight}")
          .withColumn(model.weight, weight(data(model.index)))
          .withColumn("value_mean", mean(data(model.index)))
          .withColumn("value_std", std(data(model.index)))
      }))
    )

    model.copy(
      summary,
      ParamMap(
        model.coefficients -> coefficientsAndIntercept._1,
        model.intercept -> coefficientsAndIntercept._2))
  }

  /**
    * Applies unscaling to the linear model weights.
    */
  class Unscaler[M <: ModelWithSummary[M]]
  (
    val scaler: StandardScalerModel,
    val modelTransformer : (M, StandardScalerModel) => M,
    override val uid: String = Identifiable.randomUID("unscaler")
  )
    extends ModelTransformer[M, Unscaler[M]] with HasFeaturesCol with ScalerParams {

    override def transform(dataset: DataFrame): DataFrame = Scaler.transformData(dataset, $(featuresCol), scaler)

    override def transformModel(model: M, originalData: DataFrame): M = modelTransformer(model, scaler)

    @DeveloperApi
    override def transformSchema(schema: StructType): StructType = schema
  }

}



