
org.apache.spark.ml.regression.LinearRegression.scala

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.spark.ml.regression

import scala.collection.mutable

import breeze.linalg.{DenseVector => BDV}
import breeze.optimize.{CachedDiffFunction, DiffFunction, FirstOrderMinimizer, LBFGS => BreezeLBFGS, LBFGSB => BreezeLBFGSB, OWLQN => BreezeOWLQN}
import breeze.stats.distributions.StudentsT
import org.apache.hadoop.fs.Path

import org.apache.spark.SparkException
import org.apache.spark.annotation.Since
import org.apache.spark.internal.Logging
import org.apache.spark.ml.{PipelineStage, PredictorParams}
import org.apache.spark.ml.feature._
import org.apache.spark.ml.linalg.{BLAS, Vector, Vectors}
import org.apache.spark.ml.optim.WeightedLeastSquares
import org.apache.spark.ml.optim.aggregator._
import org.apache.spark.ml.optim.loss.{L2Regularization, RDDLossFunction}
import org.apache.spark.ml.param.{DoubleParam, Param, ParamMap, ParamValidators}
import org.apache.spark.ml.param.shared._
import org.apache.spark.ml.stat._
import org.apache.spark.ml.util._
import org.apache.spark.ml.util.Instrumentation.instrumented
import org.apache.spark.mllib.evaluation.RegressionMetrics
import org.apache.spark.mllib.linalg.VectorImplicits._
import org.apache.spark.mllib.regression.{LinearRegressionModel => OldLinearRegressionModel}
import org.apache.spark.mllib.util.MLUtils
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.{DataFrame, Dataset, Row, SparkSession}
import org.apache.spark.sql.functions._
import org.apache.spark.sql.types.{DataType, DoubleType, StructType}
import org.apache.spark.storage.StorageLevel
import org.apache.spark.util.VersionUtils.majorMinorVersion

/**
 * Params for linear regression.
 */
private[regression] trait LinearRegressionParams extends PredictorParams
    with HasRegParam with HasElasticNetParam with HasMaxIter with HasTol
    with HasFitIntercept with HasStandardization with HasWeightCol with HasSolver
    with HasAggregationDepth with HasLoss with HasMaxBlockSizeInMB {

  import LinearRegression._

  /**
   * The solver algorithm for optimization.
   * Supported options: "l-bfgs", "normal" and "auto".
   * Default: "auto"
   *
   * @group param
   */
  @Since("1.6.0")
  final override val solver: Param[String] = new Param[String](this, "solver",
    "The solver algorithm for optimization. Supported options: " +
      s"${supportedSolvers.mkString(", ")}. (Default auto)",
    ParamValidators.inArray[String](supportedSolvers))

  /**
   * The loss function to be optimized.
   * Supported options: "squaredError" and "huber".
   * Default: "squaredError"
   *
   * @group param
   */
  @Since("2.3.0")
  final override val loss: Param[String] = new Param[String](this, "loss", "The loss function to" +
    s" be optimized. Supported options: ${supportedLosses.mkString(", ")}. (Default squaredError)",
    ParamValidators.inArray[String](supportedLosses))

  /**
   * The shape parameter to control the amount of robustness. Must be > 1.0.
   * At larger values of epsilon, the huber criterion becomes more similar to least squares
   * regression; for small values of epsilon, the criterion is more similar to L1 regression.
   * Default is 1.35 to get as much robustness as possible while retaining
   * 95% statistical efficiency for normally distributed data. It matches sklearn
   * HuberRegressor and is "M" from
   * "A robust hybrid of lasso and ridge regression".
   * Only valid when "loss" is "huber".
   *
   * @group expertParam
   */
  @Since("2.3.0")
  final val epsilon = new DoubleParam(this, "epsilon", "The shape parameter to control the " +
    "amount of robustness. Must be > 1.0.", ParamValidators.gt(1.0))

  /** @group getExpertParam */
  @Since("2.3.0")
  def getEpsilon: Double = $(epsilon)

  setDefault(regParam -> 0.0, fitIntercept -> true, standardization -> true,
    elasticNetParam -> 0.0, maxIter -> 100, tol -> 1E-6, solver -> Auto,
    aggregationDepth -> 2, loss -> SquaredError, epsilon -> 1.35, maxBlockSizeInMB -> 0.0)

  override protected def validateAndTransformSchema(
      schema: StructType,
      fitting: Boolean,
      featuresDataType: DataType): StructType = {
    if (fitting) {
      if ($(loss) == Huber) {
        require($(solver) != Normal, "LinearRegression with huber loss doesn't support " +
          "normal solver, please change solver to auto or l-bfgs.")
        require($(elasticNetParam) == 0.0, "LinearRegression with huber loss only supports " +
          s"L2 regularization, but got elasticNetParam = $getElasticNetParam.")
      }
    }
    super.validateAndTransformSchema(schema, fitting, featuresDataType)
  }
}
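
/*
 * Illustrative sketch of the parameter constraints enforced above (hypothetical usage, the
 * DataFrame `df` is an assumption and not defined in this file): with huber loss only L2
 * regularization and an iterative solver are allowed, e.g.
 *
 *   new LinearRegression().setLoss("huber").setRegParam(0.1)         // OK: pure L2
 *   new LinearRegression().setLoss("huber").setElasticNetParam(0.5)  // would fail at fit(df)
 *   new LinearRegression().setLoss("huber").setSolver("normal")      // would fail at fit(df)
 */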

/**
 * Linear regression.
 *
 * The learning objective is to minimize the specified loss function, with regularization.
 * This supports two kinds of loss:
 *  - squaredError (a.k.a squared loss)
 *  - huber (a hybrid of squared error for relatively small errors and absolute error for
 *  relatively large ones, and we estimate the scale parameter from training data)
 *
 * This supports multiple types of regularization:
 *  - none (a.k.a. ordinary least squares)
 *  - L2 (ridge regression)
 *  - L1 (Lasso)
 *  - L2 + L1 (elastic net)
 *
 * The squared error objective function is:
 *
 *    $$
 *    \begin{align}
 *    \min_{w}\frac{1}{2n}{\sum_{i=1}^n(X_{i}w - y_{i})^{2} +
 *    \lambda\left[\frac{1-\alpha}{2}{||w||_{2}}^{2} + \alpha{||w||_{1}}\right]}
 *    \end{align}
 *    $$
 *
 * The huber objective function is:
 *
 *    $$
 *    \begin{align}
 *    \min_{w, \sigma}\frac{1}{2n}{\sum_{i=1}^n\left(\sigma +
 *    H_m\left(\frac{X_{i}w - y_{i}}{\sigma}\right)\sigma\right) + \frac{1}{2}\lambda {||w||_2}^2}
 *    \end{align}
 *    $$
 *
 * where
 *
 *    $$
 *    \begin{align}
 *    H_m(z) = \begin{cases}
 *             z^2, & \text {if } |z| < \epsilon, \\
 *             2\epsilon|z| - \epsilon^2, & \text{otherwise}
 *             \end{cases}
 *    \end{align}
 *    $$
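 *
 * For example (a minimal sketch; the training DataFrame `training` with the default
 * "features" and "label" columns is assumed here for illustration only):
 *
 * {{{
 *   val lr = new LinearRegression()
 *     .setMaxIter(10)
 *     .setRegParam(0.3)
 *     .setElasticNetParam(0.8)
 *   val lrModel = lr.fit(training)
 *   println(s"Coefficients: ${lrModel.coefficients} Intercept: ${lrModel.intercept}")
 * }}}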
 *
 * Since 3.1.0, it supports stacking instances into blocks and using GEMV for
 * better performance.
 * The block size will be 1.0 MB, if param maxBlockSizeInMB is set 0.0 by default.
 *
 * Note: Fitting with huber loss only supports none and L2 regularization.
 */
@Since("1.3.0")
class LinearRegression @Since("1.3.0") (@Since("1.3.0") override val uid: String)
  extends Regressor[Vector, LinearRegression, LinearRegressionModel]
  with LinearRegressionParams with DefaultParamsWritable with Logging {

  import LinearRegression._

  @Since("1.4.0")
  def this() = this(Identifiable.randomUID("linReg"))

  /**
   * Set the regularization parameter.
   * Default is 0.0.
   *
   * @group setParam
   */
  @Since("1.3.0")
  def setRegParam(value: Double): this.type = set(regParam, value)

  /**
   * Set if we should fit the intercept.
   * Default is true.
   *
   * @group setParam
   */
  @Since("1.5.0")
  def setFitIntercept(value: Boolean): this.type = set(fitIntercept, value)

  /**
   * Whether to standardize the training features before fitting the model.
   * The coefficients of models will always be returned on the original scale,
   * so it will be transparent for users.
   * Default is true.
   *
   * @note With/without standardization, the models should always converge
   * to the same solution when no regularization is applied. In R's GLMNET package,
   * the default behavior is true as well.
   *
   * @group setParam
   */
  @Since("1.5.0")
  def setStandardization(value: Boolean): this.type = set(standardization, value)

  /**
   * Set the ElasticNet mixing parameter.
   * For alpha = 0, the penalty is an L2 penalty.
   * For alpha = 1, it is an L1 penalty.
   * For alpha in (0,1), the penalty is a combination of L1 and L2.
   * Default is 0.0 which is an L2 penalty.
   *
   * Note: Fitting with huber loss only supports None and L2 regularization,
   * so an exception is thrown if this param is set to a non-zero value.
   *
   * @group setParam
   */
  @Since("1.4.0")
  def setElasticNetParam(value: Double): this.type = set(elasticNetParam, value)

  /**
   * Set the maximum number of iterations.
   * Default is 100.
   *
   * @group setParam
   */
  @Since("1.3.0")
  def setMaxIter(value: Int): this.type = set(maxIter, value)

  /**
   * Set the convergence tolerance of iterations.
   * Smaller values will lead to higher accuracy at the cost of more iterations.
   * Default is 1E-6.
   *
   * @group setParam
   */
  @Since("1.4.0")
  def setTol(value: Double): this.type = set(tol, value)

  /**
   * Whether to over-/under-sample training instances according to the given weights in weightCol.
   * If not set or empty, all instances are treated equally (weight 1.0).
   * Default is not set, so all instances have weight one.
   *
   * @group setParam
   */
  @Since("1.6.0")
  def setWeightCol(value: String): this.type = set(weightCol, value)

  /**
   * Set the solver algorithm used for optimization.
   * In case of linear regression, this can be "l-bfgs", "normal" and "auto".
   *  - "l-bfgs" denotes Limited-memory BFGS which is a limited-memory quasi-Newton
   *    optimization method.
   *  - "normal" denotes using Normal Equation as an analytical solution to the linear regression
   *    problem. This solver is limited to `LinearRegression.MAX_FEATURES_FOR_NORMAL_SOLVER`.
   *  - "auto" (default) means that the solver algorithm is selected automatically.
   *    The Normal Equations solver will be used when possible, but this will automatically fall
   *    back to iterative optimization methods when needed.
   *
   * Note: Fitting with huber loss doesn't support the normal solver,
   * so an exception is thrown if this param is set to "normal".
   *
   * @group setParam
   */
  @Since("1.6.0")
  def setSolver(value: String): this.type = set(solver, value)

  /**
   * Suggested depth for treeAggregate (greater than or equal to 2).
   * If the dimensions of features or the number of partitions are large,
   * this param could be adjusted to a larger size.
   * Default is 2.
   *
   * @group expertSetParam
   */
  @Since("2.1.0")
  def setAggregationDepth(value: Int): this.type = set(aggregationDepth, value)

  /**
   * Sets the value of param [[loss]].
   * Default is "squaredError".
   *
   * @group setParam
   */
  @Since("2.3.0")
  def setLoss(value: String): this.type = set(loss, value)

  /**
   * Sets the value of param [[epsilon]].
   * Default is 1.35.
   *
   * @group setExpertParam
   */
  @Since("2.3.0")
  def setEpsilon(value: Double): this.type = set(epsilon, value)

  /**
   * Sets the value of param [[maxBlockSizeInMB]].
   * Default is 0.0, then 1.0 MB will be chosen.
   *
   * @group expertSetParam
   */
  @Since("3.1.0")
  def setMaxBlockSizeInMB(value: Double): this.type = set(maxBlockSizeInMB, value)

  override protected def train(
      dataset: Dataset[_]): LinearRegressionModel = instrumented { instr =>
    instr.logPipelineStage(this)
    instr.logDataset(dataset)
    instr.logParams(this, labelCol, featuresCol, weightCol, predictionCol, solver, tol,
      elasticNetParam, fitIntercept, maxIter, regParam, standardization, aggregationDepth, loss,
      epsilon, maxBlockSizeInMB)

    if (dataset.storageLevel != StorageLevel.NONE) {
      instr.logWarning(s"Input instances will be standardized, blockified to blocks, and " +
        s"then cached during training. Be careful of double caching!")
    }

    // Extract the number of features before deciding optimization solver.
    val numFeatures = MetadataUtils.getNumFeatures(dataset, $(featuresCol))
    instr.logNumFeatures(numFeatures)

    if ($(loss) == SquaredError && (($(solver) == Auto &&
      numFeatures <= WeightedLeastSquares.MAX_NUM_FEATURES) || $(solver) == Normal)) {
      return trainWithNormal(dataset, instr)
    }

    val instances = extractInstances(dataset)
      .setName("training instances")

    val (summarizer, labelSummarizer) = Summarizer
      .getRegressionSummarizers(instances, $(aggregationDepth), Seq("mean", "std", "count"))

    val yMean = labelSummarizer.mean(0)
    val rawYStd = labelSummarizer.std(0)

    instr.logNumExamples(labelSummarizer.count)
    instr.logNamedValue(Instrumentation.loggerTags.meanOfLabels, yMean)
    instr.logNamedValue(Instrumentation.loggerTags.varianceOfLabels, rawYStd)
    instr.logSumOfWeights(summarizer.weightSum)

    var actualBlockSizeInMB = $(maxBlockSizeInMB)
    if (actualBlockSizeInMB == 0) {
      actualBlockSizeInMB = InstanceBlock.DefaultBlockSizeInMB
      require(actualBlockSizeInMB > 0, "inferred actual BlockSizeInMB must > 0")
      instr.logNamedValue("actualBlockSizeInMB", actualBlockSizeInMB.toString)
    }

    if (rawYStd == 0.0) {
      if ($(fitIntercept) || yMean == 0.0) {
        return trainWithConstantLabel(dataset, instr, numFeatures, yMean)
      } else {
        require($(regParam) == 0.0, "The standard deviation of the label is zero. " +
          "Model cannot be regularized.")
        instr.logWarning(s"The standard deviation of the label is zero. " +
          "Consider setting fitIntercept=true.")
      }
    }

    // if y is constant (rawYStd is zero), then y cannot be scaled. In this case
    // setting yStd=abs(yMean) ensures that y is not scaled anymore in l-bfgs algorithm.
    val yStd = if (rawYStd > 0) rawYStd else math.abs(yMean)

    val featuresMean = summarizer.mean.toArray
    val featuresStd = summarizer.std.toArray

    if (!$(fitIntercept) &&
      (0 until numFeatures).exists(i => featuresStd(i) == 0.0 && featuresMean(i) != 0.0)) {
      instr.logWarning("Fitting LinearRegressionModel without intercept on dataset with " +
        "constant nonzero column, Spark MLlib outputs zero coefficients for constant nonzero " +
        "columns. This behavior is the same as R glmnet but different from LIBSVM.")
    }

    // Since we implicitly do the feature scaling when we compute the cost function
    // to improve the convergence, the effective regParam will be changed.
    val effectiveRegParam = $(loss) match {
      case SquaredError => $(regParam) / yStd
      case Huber => $(regParam)
    }
    val effectiveL1RegParam = $(elasticNetParam) * effectiveRegParam
    val effectiveL2RegParam = (1.0 - $(elasticNetParam)) * effectiveRegParam

    val getFeaturesStd = (j: Int) => if (j >= 0 && j < numFeatures) featuresStd(j) else 0.0
    val regularization = if (effectiveL2RegParam != 0.0) {
      val shouldApply = (idx: Int) => idx >= 0 && idx < numFeatures
      Some(new L2Regularization(effectiveL2RegParam, shouldApply,
        if ($(standardization)) None else Some(getFeaturesStd)))
    } else None

    val optimizer = createOptimizer(effectiveRegParam, effectiveL1RegParam,
      numFeatures, featuresStd)

    val initialSolution = $(loss) match {
      case SquaredError =>
        Array.ofDim[Double](numFeatures)
      case Huber =>
        val dim = if ($(fitIntercept)) numFeatures + 2 else numFeatures + 1
        Array.fill(dim)(1.0)
    }

    val (parameters, objectiveHistory) =
      trainImpl(instances, actualBlockSizeInMB, yMean, yStd,
        featuresMean, featuresStd, initialSolution, regularization, optimizer)

    if (parameters == null) {
      val msg = s"${optimizer.getClass.getName} failed."
      instr.logError(msg)
      throw new SparkException(msg)
    }

    val model = createModel(parameters, yMean, yStd, featuresMean, featuresStd)
    // Handle possible missing or invalid prediction columns
    val (summaryModel, predictionColName) = model.findSummaryModelAndPredictionCol()

    val trainingSummary = new LinearRegressionTrainingSummary(
      summaryModel.transform(dataset), predictionColName, $(labelCol), $(featuresCol),
      model, Array(0.0), objectiveHistory)
    model.setSummary(Some(trainingSummary))
  }

  private def trainWithNormal(
      dataset: Dataset[_],
      instr: Instrumentation): LinearRegressionModel = {
    // For low dimensional data, WeightedLeastSquares is more efficient since the
    // training algorithm only requires one pass through the data. (SPARK-10668)
    val optimizer = new WeightedLeastSquares($(fitIntercept), $(regParam),
      elasticNetParam = $(elasticNetParam), $(standardization), true,
      solverType = WeightedLeastSquares.Auto, maxIter = $(maxIter), tol = $(tol))
    val instances = extractInstances(dataset)
      .setName("training instances")
    val model = optimizer.fit(instances, instr = OptionalInstrumentation.create(instr))
    // When it is trained by WeightedLeastSquares, training summary does not
    // attach returned model.
    val lrModel = copyValues(new LinearRegressionModel(uid, model.coefficients, model.intercept))
    val (summaryModel, predictionColName) = lrModel.findSummaryModelAndPredictionCol()
    val trainingSummary = new LinearRegressionTrainingSummary(
      summaryModel.transform(dataset), predictionColName, $(labelCol), $(featuresCol),
      summaryModel, model.diagInvAtWA.toArray, model.objectiveHistory)
    lrModel.setSummary(Some(trainingSummary))
  }

  private def trainWithConstantLabel(
      dataset: Dataset[_],
      instr: Instrumentation,
      numFeatures: Int,
      yMean: Double): LinearRegressionModel = {
    // If the rawYStd==0 and fitIntercept==true, then the intercept is yMean with
    // zero coefficient; as a result, training is not needed.
    // Also, if rawYStd==0 and yMean==0, all the coefficients are zero regardless of
    // the fitIntercept.
    if (yMean == 0.0) {
      instr.logWarning(s"Mean and standard deviation of the label are zero, so the " +
        s"coefficients and the intercept will all be zero; as a result, training is not " +
        s"needed.")
    } else {
      instr.logWarning(s"The standard deviation of the label is zero, so the coefficients " +
        s"will be zeros and the intercept will be the mean of the label; as a result, " +
        s"training is not needed.")
    }
    val coefficients = Vectors.sparse(numFeatures, Seq.empty)
    val intercept = yMean

    val model = copyValues(new LinearRegressionModel(uid, coefficients, intercept))
    // Handle possible missing or invalid prediction columns
    val (summaryModel, predictionColName) = model.findSummaryModelAndPredictionCol()

    val trainingSummary = new LinearRegressionTrainingSummary(
      summaryModel.transform(dataset), predictionColName, $(labelCol), $(featuresCol),
      model, Array(0.0), Array(0.0))
    model.setSummary(Some(trainingSummary))
  }

  private def createOptimizer(
      effectiveRegParam: Double,
      effectiveL1RegParam: Double,
      numFeatures: Int,
      featuresStd: Array[Double]) = {
    $(loss) match {
      case SquaredError =>
        if ($(elasticNetParam) == 0.0 || effectiveRegParam == 0.0) {
          new BreezeLBFGS[BDV[Double]]($(maxIter), 10, $(tol))
        } else {
          val standardizationParam = $(standardization)
          def effectiveL1RegFun = (index: Int) => {
            if (standardizationParam) {
              effectiveL1RegParam
            } else {
              // If `standardization` is false, we still standardize the data
              // to improve the rate of convergence; as a result, we have to
              // perform this reverse standardization by penalizing each component
              // differently to get effectively the same objective function when
              // the training dataset is not standardized.
              if (featuresStd(index) != 0.0) effectiveL1RegParam / featuresStd(index) else 0.0
            }
          }
          new BreezeOWLQN[Int, BDV[Double]]($(maxIter), 10, effectiveL1RegFun, $(tol))
        }
      case Huber =>
        val dim = if ($(fitIntercept)) numFeatures + 2 else numFeatures + 1
        val lowerBounds = BDV[Double](Array.fill(dim)(Double.MinValue))
        // Optimize huber loss in space "\sigma > 0"
        lowerBounds(dim - 1) = Double.MinPositiveValue
        val upperBounds = BDV[Double](Array.fill(dim)(Double.MaxValue))
        new BreezeLBFGSB(lowerBounds, upperBounds, $(maxIter), 10, $(tol))
    }
  }

  private def trainImpl(
      instances: RDD[Instance],
      actualBlockSizeInMB: Double,
      yMean: Double,
      yStd: Double,
      featuresMean: Array[Double],
      featuresStd: Array[Double],
      initialSolution: Array[Double],
      regularization: Option[L2Regularization],
      optimizer: FirstOrderMinimizer[BDV[Double], DiffFunction[BDV[Double]]]) = {
    val numFeatures = featuresStd.length
    val inverseStd = featuresStd.map(std => if (std != 0) 1.0 / std else 0.0)
    val scaledMean = Array.tabulate(numFeatures)(i => inverseStd(i) * featuresMean(i))
    val bcInverseStd = instances.context.broadcast(inverseStd)
    val bcScaledMean = instances.context.broadcast(scaledMean)

    val standardized = instances.mapPartitions { iter =>
      val func = StandardScalerModel.getTransformFunc(Array.empty, bcInverseStd.value, false, true)
      iter.map { case Instance(label, weight, vec) => Instance(label, weight, func(vec)) }
    }

    val maxMemUsage = (actualBlockSizeInMB * 1024L * 1024L).ceil.toLong
    val blocks = InstanceBlock.blokifyWithMaxMemUsage(standardized, maxMemUsage)
      .persist(StorageLevel.MEMORY_AND_DISK)
      .setName(s"$uid: training blocks (blockSizeInMB=$actualBlockSizeInMB)")

    if ($(fitIntercept) && $(loss) == Huber) {
      // original `initialSolution` is for problem:
      // y = f(w1 * x1 / std_x1, w2 * x2 / std_x2, ..., intercept)
      // we should adjust it to the initial solution for problem:
      // y = f(w1 * (x1 - avg_x1) / std_x1, w2 * (x2 - avg_x2) / std_x2, ..., intercept)
      // NOTE: this is NOOP before we finally support model initialization
      val adapt = BLAS.javaBLAS.ddot(numFeatures, initialSolution, 1, scaledMean, 1)
      initialSolution(numFeatures) += adapt
    }

    val costFun = $(loss) match {
      case SquaredError =>
        val getAggregatorFunc = new LeastSquaresBlockAggregator(bcInverseStd, bcScaledMean,
          $(fitIntercept), yStd, yMean)(_)
        new RDDLossFunction(blocks, getAggregatorFunc, regularization, $(aggregationDepth))
      case Huber =>
        val getAggregatorFunc = new HuberBlockAggregator(bcInverseStd, bcScaledMean,
          $(fitIntercept), $(epsilon))(_)
        new RDDLossFunction(blocks, getAggregatorFunc, regularization, $(aggregationDepth))
    }

    val states = optimizer.iterations(new CachedDiffFunction(costFun),
      new BDV(initialSolution))

    /*
       Note that in Linear Regression, the objective history (loss + regularization) returned
       from optimizer is computed in the scaled space given by the following formula.
          $$
          L &= 1/2n||\sum_i w_i(x_i - \bar{x_i}) / \hat{x_i} - (y - \bar{y}) / \hat{y}||^2
               + regTerms \\
          $$
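       Because the loss is computed on the standardized features and labels, the values
       recorded in objectiveHistory below are on this scaled objective rather than on the
       original label scale.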
     */
    val arrayBuilder = mutable.ArrayBuilder.make[Double]
    var state: optimizer.State = null
    while (states.hasNext) {
      state = states.next()
      arrayBuilder += state.adjustedValue
    }
    blocks.unpersist()
    bcInverseStd.destroy()
    bcScaledMean.destroy()

    val solution = if (state == null) null else state.x.toArray
    if ($(fitIntercept) && $(loss) == Huber && solution != null) {
      // the final solution is for problem:
      // y = f(w1 * (x1 - avg_x1) / std_x1, w2 * (x2 - avg_x2) / std_x2, ..., intercept)
      // we should adjust it back for original problem:
      // y = f(w1 * x1 / std_x1, w2 * x2 / std_x2, ..., intercept)
      val adapt = BLAS.javaBLAS.ddot(numFeatures, solution, 1, scaledMean, 1)
      solution(numFeatures) -= adapt
    }
    (solution, arrayBuilder.result)
  }

  private def createModel(
      solution: Array[Double],
      yMean: Double,
      yStd: Double,
      featuresMean: Array[Double],
      featuresStd: Array[Double]): LinearRegressionModel = {
    val numFeatures = featuresStd.length
    /*
       The coefficients are trained in the scaled space; we're converting them back to
       the original space.
     */
    val multiplier = if ($(loss) == Huber) 1.0 else yStd
    val rawCoefficients = Array.tabulate(numFeatures) { i =>
      if (featuresStd(i) != 0) solution(i) * multiplier / featuresStd(i) else 0.0
    }

    val intercept = if ($(fitIntercept)) {
      $(loss) match {
        case SquaredError =>
          /*
             The intercept of squared error in R's GLMNET is computed using closed form
             after the coefficients are converged. See the following discussion for detail.
             http://stats.stackexchange.com/questions/13617/how-is-the-intercept-computed-in-glmnet
           */
          yMean - BLAS.dot(Vectors.dense(rawCoefficients), Vectors.dense(featuresMean))
        case Huber => solution(numFeatures)
      }
    } else 0.0

    val coefficients = Vectors.dense(rawCoefficients).compressed
    val scale = if ($(loss) == Huber) solution.last else 1.0
    copyValues(new LinearRegressionModel(uid, coefficients, intercept, scale))
  }

  @Since("1.4.0")
  override def copy(extra: ParamMap): LinearRegression = defaultCopy(extra)
}

@Since("1.6.0")
object LinearRegression extends DefaultParamsReadable[LinearRegression] {

  @Since("1.6.0")
  override def load(path: String): LinearRegression = super.load(path)

  /**
   * When using `LinearRegression.solver` == "normal", the solver must limit the number of
   * features to at most this number. The entire covariance matrix X^T^X will be collected
   * to the driver. This limit helps prevent memory overflow errors.
   */
  @Since("2.1.0")
  val MAX_FEATURES_FOR_NORMAL_SOLVER: Int = WeightedLeastSquares.MAX_NUM_FEATURES

  /** String name for "auto". */
  private[regression] val Auto = "auto"

  /** String name for "normal". */
  private[regression] val Normal = "normal"

  /** String name for "l-bfgs". */
  private[regression] val LBFGS = "l-bfgs"

  /** Set of solvers that LinearRegression supports. */
  private[regression] val supportedSolvers = Array(Auto, Normal, LBFGS)

  /** String name for "squaredError". */
  private[regression] val SquaredError = "squaredError"

  /** String name for "huber". */
  private[regression] val Huber = "huber"

  /** Set of loss function names that LinearRegression supports. */
  private[regression] val supportedLosses = Array(SquaredError, Huber)
}

/**
 * Model produced by [[LinearRegression]].
*/ @Since("1.3.0") class LinearRegressionModel private[ml] ( @Since("1.4.0") override val uid: String, @Since("2.0.0") val coefficients: Vector, @Since("1.3.0") val intercept: Double, @Since("2.3.0") val scale: Double) extends RegressionModel[Vector, LinearRegressionModel] with LinearRegressionParams with GeneralMLWritable with HasTrainingSummary[LinearRegressionTrainingSummary] { private[ml] def this(uid: String, coefficients: Vector, intercept: Double) = this(uid, coefficients, intercept, 1.0) override val numFeatures: Int = coefficients.size /** * Gets summary (e.g. residuals, mse, r-squared ) of model on training set. An exception is * thrown if `hasSummary` is false. */ @Since("1.5.0") override def summary: LinearRegressionTrainingSummary = super.summary /** * Evaluates the model on a test dataset. * * @param dataset Test dataset to evaluate model on. */ @Since("2.0.0") def evaluate(dataset: Dataset[_]): LinearRegressionSummary = { // Handle possible missing or invalid prediction columns val (summaryModel, predictionColName) = findSummaryModelAndPredictionCol() new LinearRegressionSummary(summaryModel.transform(dataset), predictionColName, $(labelCol), $(featuresCol), summaryModel, Array(0.0)) } /** * If the prediction column is set returns the current model and prediction column, * otherwise generates a new column and sets it as the prediction column on a new copy * of the current model. */ private[regression] def findSummaryModelAndPredictionCol(): (LinearRegressionModel, String) = { $(predictionCol) match { case "" => val predictionColName = "prediction_" + java.util.UUID.randomUUID.toString (copy(ParamMap.empty).setPredictionCol(predictionColName), predictionColName) case p => (this, p) } } override def predict(features: Vector): Double = { BLAS.dot(features, coefficients) + intercept } @Since("1.4.0") override def copy(extra: ParamMap): LinearRegressionModel = { val newModel = copyValues(new LinearRegressionModel(uid, coefficients, intercept), extra) newModel.setSummary(trainingSummary).setParent(parent) } /** * Returns a [[org.apache.spark.ml.util.GeneralMLWriter]] instance for this ML instance. * * For [[LinearRegressionModel]], this does NOT currently save the training [[summary]]. * An option to save [[summary]] may be added in the future. * * This also does not save the [[parent]] currently. 
*/ @Since("1.6.0") override def write: GeneralMLWriter = new GeneralMLWriter(this) @Since("3.0.0") override def toString: String = { s"LinearRegressionModel: uid=$uid, numFeatures=$numFeatures" } } /** A writer for LinearRegression that handles the "internal" (or default) format */ private class InternalLinearRegressionModelWriter extends MLWriterFormat with MLFormatRegister { override def format(): String = "internal" override def stageName(): String = "org.apache.spark.ml.regression.LinearRegressionModel" private case class Data(intercept: Double, coefficients: Vector, scale: Double) override def write(path: String, sparkSession: SparkSession, optionMap: mutable.Map[String, String], stage: PipelineStage): Unit = { val instance = stage.asInstanceOf[LinearRegressionModel] val sc = sparkSession.sparkContext // Save metadata and Params DefaultParamsWriter.saveMetadata(instance, path, sc) // Save model data: intercept, coefficients, scale val data = Data(instance.intercept, instance.coefficients, instance.scale) val dataPath = new Path(path, "data").toString sparkSession.createDataFrame(Seq(data)).repartition(1).write.parquet(dataPath) } } /** A writer for LinearRegression that handles the "pmml" format */ private class PMMLLinearRegressionModelWriter extends MLWriterFormat with MLFormatRegister { override def format(): String = "pmml" override def stageName(): String = "org.apache.spark.ml.regression.LinearRegressionModel" private case class Data(intercept: Double, coefficients: Vector) override def write(path: String, sparkSession: SparkSession, optionMap: mutable.Map[String, String], stage: PipelineStage): Unit = { val sc = sparkSession.sparkContext // Construct the MLLib model which knows how to write to PMML. val instance = stage.asInstanceOf[LinearRegressionModel] val oldModel = new OldLinearRegressionModel(instance.coefficients, instance.intercept) // Save PMML oldModel.toPMML(sc, path) } } @Since("1.6.0") object LinearRegressionModel extends MLReadable[LinearRegressionModel] { @Since("1.6.0") override def read: MLReader[LinearRegressionModel] = new LinearRegressionModelReader @Since("1.6.0") override def load(path: String): LinearRegressionModel = super.load(path) private class LinearRegressionModelReader extends MLReader[LinearRegressionModel] { /** Checked against metadata when loading model */ private val className = classOf[LinearRegressionModel].getName override def load(path: String): LinearRegressionModel = { val metadata = DefaultParamsReader.loadMetadata(path, sc, className) val dataPath = new Path(path, "data").toString val data = sparkSession.read.format("parquet").load(dataPath) val (majorVersion, minorVersion) = majorMinorVersion(metadata.sparkVersion) val model = if (majorVersion < 2 || (majorVersion == 2 && minorVersion <= 2)) { // Spark 2.2 and before val Row(intercept: Double, coefficients: Vector) = MLUtils.convertVectorColumnsToML(data, "coefficients") .select("intercept", "coefficients") .head() new LinearRegressionModel(metadata.uid, coefficients, intercept) } else { // Spark 2.3 and later val Row(intercept: Double, coefficients: Vector, scale: Double) = data.select("intercept", "coefficients", "scale").head() new LinearRegressionModel(metadata.uid, coefficients, intercept, scale) } metadata.getAndSetParams(model) model } } } /** * Linear regression training results. Currently, the training summary ignores the * training weights except for the objective trace. * * @param predictions predictions output by the model's `transform` method. 
 * @param objectiveHistory objective function (scaled loss + regularization) at each iteration.
 */
@Since("1.5.0")
class LinearRegressionTrainingSummary private[regression] (
    predictions: DataFrame,
    predictionCol: String,
    labelCol: String,
    featuresCol: String,
    model: LinearRegressionModel,
    diagInvAtWA: Array[Double],
    val objectiveHistory: Array[Double])
  extends LinearRegressionSummary(
    predictions,
    predictionCol,
    labelCol,
    featuresCol,
    model,
    diagInvAtWA) {

  /**
   * Number of training iterations until termination
   *
   * This value is only available when using the "l-bfgs" solver.
   *
   * @see `LinearRegression.solver`
   */
  @Since("1.5.0")
  val totalIterations = {
    assert(objectiveHistory.length > 0, s"objectiveHistory length should be greater than 1.")
    objectiveHistory.length - 1
  }
}

/**
 * Linear regression results evaluated on a dataset.
 *
 * @param predictions predictions output by the model's `transform` method.
 * @param predictionCol Field in "predictions" which gives the predicted value of the label at
 *                      each instance.
 * @param labelCol Field in "predictions" which gives the true label of each instance.
 * @param featuresCol Field in "predictions" which gives the features of each instance as a vector.
 */
@Since("1.5.0")
class LinearRegressionSummary private[regression] (
    @transient val predictions: DataFrame,
    val predictionCol: String,
    val labelCol: String,
    val featuresCol: String,
    private val privateModel: LinearRegressionModel,
    private val diagInvAtWA: Array[Double]) extends Serializable {

  @transient private val metrics = {
    val weightCol =
      if (!privateModel.isDefined(privateModel.weightCol) || privateModel.getWeightCol.isEmpty) {
        lit(1.0)
      } else {
        col(privateModel.getWeightCol).cast(DoubleType)
      }

    new RegressionMetrics(
      predictions
        .select(col(predictionCol), col(labelCol).cast(DoubleType), weightCol)
        .rdd
        .map { case Row(pred: Double, label: Double, weight: Double) => (pred, label, weight) },
      !privateModel.getFitIntercept)
  }

  /**
   * Returns the explained variance regression score.
   * explainedVariance = 1 - variance(y - \hat{y}) / variance(y)
   * Reference: Wikipedia explain variation
   */
  @Since("1.5.0")
  val explainedVariance: Double = metrics.explainedVariance

  /**
   * Returns the mean absolute error, which is a risk function corresponding to the
   * expected value of the absolute error loss or l1-norm loss.
   */
  @Since("1.5.0")
  val meanAbsoluteError: Double = metrics.meanAbsoluteError

  /**
   * Returns the mean squared error, which is a risk function corresponding to the
   * expected value of the squared error loss or quadratic loss.
   */
  @Since("1.5.0")
  val meanSquaredError: Double = metrics.meanSquaredError

  /**
   * Returns the root mean squared error, which is defined as the square root of
   * the mean squared error.
   */
  @Since("1.5.0")
  val rootMeanSquaredError: Double = metrics.rootMeanSquaredError

  /**
   * Returns R^2^, the coefficient of determination.
   * Reference: Wikipedia coefficient of determination
   */
  @Since("1.5.0")
  val r2: Double = metrics.r2

  /**
   * Returns Adjusted R^2^, the adjusted coefficient of determination.
   * Reference: Wikipedia coefficient of determination
   */
  @Since("2.3.0")
  val r2adj: Double = {
    val interceptDOF = if (privateModel.getFitIntercept) 1 else 0
    1 - (1 - r2) * (numInstances - interceptDOF) /
      (numInstances - privateModel.coefficients.size - interceptDOF)
  }

  /** Residuals (label - predicted value) */
  @Since("1.5.0")
  @transient lazy val residuals: DataFrame = {
    val t = udf { (pred: Double, label: Double) => label - pred }
    predictions.select(t(col(predictionCol), col(labelCol)).as("residuals"))
  }

  /** Number of instances in DataFrame predictions */
  lazy val numInstances: Long = metrics.count

  /** Degrees of freedom */
  @Since("2.2.0")
  val degreesOfFreedom: Long = if (privateModel.getFitIntercept) {
    numInstances - privateModel.coefficients.size - 1
  } else {
    numInstances - privateModel.coefficients.size
  }

  /**
   * The weighted residuals, the usual residuals rescaled by
   * the square root of the instance weights.
   */
  lazy val devianceResiduals: Array[Double] = {
    val weighted =
      if (!privateModel.isDefined(privateModel.weightCol) || privateModel.getWeightCol.isEmpty) {
        lit(1.0)
      } else {
        sqrt(col(privateModel.getWeightCol))
      }
    val dr = predictions
      .select(col(privateModel.getLabelCol).minus(col(privateModel.getPredictionCol))
        .multiply(weighted).as("weightedResiduals"))
      .select(min(col("weightedResiduals")).as("min"), max(col("weightedResiduals")).as("max"))
      .first()
    Array(dr.getDouble(0), dr.getDouble(1))
  }

  /**
   * Standard error of estimated coefficients and intercept.
   * This value is only available when using the "normal" solver.
   *
   * If `LinearRegression.fitIntercept` is set to true,
   * then the last element returned corresponds to the intercept.
   *
   * @see `LinearRegression.solver`
   */
  lazy val coefficientStandardErrors: Array[Double] = {
    if (diagInvAtWA.length == 1 && diagInvAtWA(0) == 0) {
      throw new UnsupportedOperationException(
        "No Std. Error of coefficients available for this LinearRegressionModel")
    } else {
      val rss =
        if (!privateModel.isDefined(privateModel.weightCol) || privateModel.getWeightCol.isEmpty) {
          meanSquaredError * numInstances
        } else {
          val t = udf { (pred: Double, label: Double, weight: Double) =>
            math.pow(label - pred, 2.0) * weight }
          predictions.select(t(col(privateModel.getPredictionCol), col(privateModel.getLabelCol),
            col(privateModel.getWeightCol)).as("wse")).agg(sum(col("wse"))).first().getDouble(0)
        }
      val sigma2 = rss / degreesOfFreedom
      diagInvAtWA.map(_ * sigma2).map(math.sqrt)
    }
  }

  /**
   * T-statistic of estimated coefficients and intercept.
   * This value is only available when using the "normal" solver.
   *
   * If `LinearRegression.fitIntercept` is set to true,
   * then the last element returned corresponds to the intercept.
   *
   * @see `LinearRegression.solver`
   */
  lazy val tValues: Array[Double] = {
    if (diagInvAtWA.length == 1 && diagInvAtWA(0) == 0) {
      throw new UnsupportedOperationException(
        "No t-statistic available for this LinearRegressionModel")
    } else {
      val estimate = if (privateModel.getFitIntercept) {
        Array.concat(privateModel.coefficients.toArray, Array(privateModel.intercept))
      } else {
        privateModel.coefficients.toArray
      }
      estimate.zip(coefficientStandardErrors).map { x => x._1 / x._2 }
    }
  }

  /**
   * Two-sided p-value of estimated coefficients and intercept.
   * This value is only available when using the "normal" solver.
   *
   * If `LinearRegression.fitIntercept` is set to true,
   * then the last element returned corresponds to the intercept.
   *
   * @see `LinearRegression.solver`
   */
  lazy val pValues: Array[Double] = {
    if (diagInvAtWA.length == 1 && diagInvAtWA(0) == 0) {
      throw new UnsupportedOperationException(
        "No p-value available for this LinearRegressionModel")
    } else {
      tValues.map { x => 2.0 * (1.0 - StudentsT(degreesOfFreedom.toDouble).cdf(math.abs(x))) }
    }
  }
}
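
/*
 * A minimal usage sketch of the summary API defined above. The fitted `lrModel` and the
 * DataFrame `test` are illustrative assumptions only; they are not defined in this file.
 *
 *   val testSummary = lrModel.evaluate(test)
 *   println(s"RMSE: ${testSummary.rootMeanSquaredError}, r2: ${testSummary.r2}")
 *
 *   // Standard errors, t-statistics and p-values are exposed on the training summary and are
 *   // only populated when the model was fit with the "normal" solver:
 *   // println(lrModel.summary.pValues.mkString(", "))
 */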



