// org.apache.flink.ml.regression.MultipleLinearRegression.scala Maven / Gradle / Ivy
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.flink.ml.regression
import org.apache.flink.api.scala.DataSet
import org.apache.flink.ml.math.{Breeze, Vector}
import org.apache.flink.ml.common._
import org.apache.flink.api.scala._
import org.apache.flink.ml.optimization.{LinearPrediction, SquaredLoss, GenericLossFunction, SimpleGradientDescent}
import org.apache.flink.ml.pipeline.{PredictOperation, FitOperation, Predictor}
/** Multiple linear regression using the ordinary least squares (OLS) estimator.
*
* The linear regression finds a solution to the problem
*
* `y = w_0 + w_1*x_1 + w_2*x_2 ... + w_n*x_n = w_0 + w^T*x`
*
* such that the sum of squared residuals is minimized
*
* `min_{w, w_0} \sum (y - w^T*x - w_0)^2`
*
* The minimization problem is solved by (stochastic) gradient descent. For each labeled vector
* `(x,y)`, the gradient is calculated. The weighted average of all gradients is subtracted from
* the current value `w`, which gives the new value `w_new`. The weight is defined as
* `stepsize/math.sqrt(iteration)`.
*
* The optimization runs at most a maximum number of iterations or, if a convergence threshold has
* been set, until the convergence criterion has been met. As convergence criterion the relative
* change of the sum of squared residuals is used:
*
* `(S_{k-1} - S_k)/S_{k-1} < \rho`
*
* with S_k being the sum of squared residuals in iteration k and `\rho` being the convergence
* threshold.
*
* At the moment, the whole partition is used for SGD, making it effectively a batch gradient
* descent. Once a sampling operator has been introduced, the algorithm can be optimized.
*
* @example
* {{{
* val mlr = MultipleLinearRegression()
* .setIterations(10)
* .setStepsize(0.5)
* .setConvergenceThreshold(0.001)
*
* val trainingDS: DataSet[LabeledVector] = ...
* val testingDS: DataSet[Vector] = ...
*
* mlr.fit(trainingDS)
*
* val predictions = mlr.predict(testingDS)
* }}}
*
* =Parameters=
*
* - [[org.apache.flink.ml.regression.MultipleLinearRegression.Iterations]]:
* Maximum number of iterations.
*
* - [[org.apache.flink.ml.regression.MultipleLinearRegression.Stepsize]]:
* Initial step size for the gradient descent method.
* This value controls how far the gradient descent method moves in the opposite direction of the
* gradient. Tuning this parameter might be crucial to make it stable and to obtain a better
* performance.
*
* - [[org.apache.flink.ml.regression.MultipleLinearRegression.ConvergenceThreshold]]:
* Threshold for relative change of sum of squared residuals until convergence.
*
*/
class MultipleLinearRegression extends Predictor[MultipleLinearRegression] {

  import org.apache.flink.ml._
  import MultipleLinearRegression._

  /** Weights of the linear model. Remains `None` until the fitting phase has stored a value. */
  var weightsOption: Option[DataSet[WeightVector]] = None

  /** Registers the [[MultipleLinearRegression.Iterations]] parameter on this instance.
    *
    * @param iterations value stored under the `Iterations` key
    * @return this instance, so that setter calls can be chained
    */
  def setIterations(iterations: Int): MultipleLinearRegression = {
    parameters.add(Iterations, iterations)
    this
  }

  /** Registers the [[MultipleLinearRegression.Stepsize]] parameter on this instance.
    *
    * @param stepsize value stored under the `Stepsize` key
    * @return this instance, so that setter calls can be chained
    */
  def setStepsize(stepsize: Double): MultipleLinearRegression = {
    parameters.add(Stepsize, stepsize)
    this
  }

  /** Registers the [[MultipleLinearRegression.ConvergenceThreshold]] parameter on this instance.
    *
    * @param convergenceThreshold value stored under the `ConvergenceThreshold` key
    * @return this instance, so that setter calls can be chained
    */
  def setConvergenceThreshold(convergenceThreshold: Double): MultipleLinearRegression = {
    parameters.add(ConvergenceThreshold, convergenceThreshold)
    this
  }

  /** Evaluates the companion object's `lossFunction` for every element of `input` against the
    * stored weights (shipped as a broadcast variable) and adds up the individual loss values.
    *
    * NOTE(review): whether the per-element loss is exactly the squared residual or a scaled
    * variant depends on `SquaredLoss` -- confirm before relying on the absolute value.
    *
    * @param input labeled vectors for which the loss is accumulated
    * @return data set produced by reducing all per-element losses with `+`
    * @throws RuntimeException if no weights have been stored yet, i.e. the model was not fitted
    */
  def squaredResidualSum(input: DataSet[LabeledVector]): DataSet[Double] = {
    // Fail fast if there is no weight vector to evaluate against.
    val fittedWeights = weightsOption.getOrElse {
      throw new RuntimeException("The MultipleLinearRegression has not been fitted to the " +
        "data. This is necessary to learn the weight vector of the linear function.")
    }

    val perElementLoss = input.mapWithBcVariable(fittedWeights) {
      (sample, weightVector) => lossFunction.loss(sample, weightVector)
    }

    perElementLoss.reduce { (left, right) => left + right }
  }
}
object MultipleLinearRegression {

  // Not referenced anywhere in this file; kept because it is a public member that external
  // code may rely on.
  val WEIGHTVECTOR_BROADCAST = "weights_broadcast"

  /** Loss function shared by training (`fitMLR`) and by
    * `MultipleLinearRegression.squaredResidualSum`, built from `SquaredLoss` and
    * `LinearPrediction`.
    */
  val lossFunction = GenericLossFunction(SquaredLoss, LinearPrediction)

  // ====================================== Parameters =============================================

  /** Step size handed to the gradient descent optimizer. Defaults to 0.1. */
  case object Stepsize extends Parameter[Double] {
    val defaultValue = Some(0.1)
  }

  /** Number of iterations handed to the optimizer. Defaults to 10. */
  case object Iterations extends Parameter[Int] {
    val defaultValue = Some(10)
  }

  /** Optional convergence threshold. There is no default, so it is only applied when set. */
  case object ConvergenceThreshold extends Parameter[Double] {
    val defaultValue = None
  }

  // ======================================== Factory methods ======================================

  /** Creates a new, unfitted [[MultipleLinearRegression]]. */
  def apply(): MultipleLinearRegression = {
    new MultipleLinearRegression()
  }

  // ====================================== Operations =============================================

  /** Trains the linear model to fit the training data. The resulting weight vector is stored in
    * the [[MultipleLinearRegression]] instance.
    *
    * The result type is declared explicitly so that implicit resolution does not depend on
    * type inference of the anonymous class.
    */
  implicit val fitMLR: FitOperation[MultipleLinearRegression, LabeledVector] =
    new FitOperation[MultipleLinearRegression, LabeledVector] {

      /** Configures a `SimpleGradientDescent` from the combined parameters, runs it on `input`
        * and stores the result in `instance.weightsOption`.
        *
        * @param instance predictor whose `weightsOption` is overwritten with the result
        * @param fitParameters parameters combined with `instance.parameters` via `++`
        *                      (presumably taking precedence -- confirm against `ParameterMap`)
        * @param input labeled training data
        */
      override def fit(
          instance: MultipleLinearRegression,
          fitParameters: ParameterMap,
          input: DataSet[LabeledVector])
        : Unit = {
        val map = instance.parameters ++ fitParameters

        // retrieve parameters of the algorithm
        val numberOfIterations = map(Iterations)
        val stepsize = map(Stepsize)
        val convergenceThreshold = map.get(ConvergenceThreshold)

        // Use the companion-level loss function rather than building a second, identical one,
        // so training and squaredResidualSum always agree on the loss.
        val optimizer = SimpleGradientDescent()
          .setIterations(numberOfIterations)
          .setStepsize(stepsize)
          .setLossFunction(lossFunction)

        // The threshold is optional: only configure it when a value has been provided.
        convergenceThreshold.foreach { threshold =>
          optimizer.setConvergenceThreshold(threshold)
        }

        // NOTE(review): the second argument `None` presumably means "no initial weights" --
        // confirm against the optimizer's `optimize` signature.
        instance.weightsOption = Some(optimizer.optimize(input, None))
      }
    }

  /** Provides the prediction operation for any subtype `T` of `Vector`: the prediction is the
    * dot product of the input vector with the model weights plus the intercept.
    *
    * @tparam T concrete vector type of the data to predict
    */
  implicit def predictVectors[T <: Vector]
    : PredictOperation[MultipleLinearRegression, WeightVector, T, Double] = {
    new PredictOperation[MultipleLinearRegression, WeightVector, T, Double]() {

      /** Returns the weights stored on `self`.
        *
        * @throws RuntimeException if `self` holds no weights, i.e. it has not been fitted
        */
      override def getModel(self: MultipleLinearRegression, predictParameters: ParameterMap)
        : DataSet[WeightVector] = {
        self.weightsOption.getOrElse {
          throw new RuntimeException("The MultipleLinearRegression has not been fitted to the " +
            "data. This is necessary to learn the weight vector of the linear function.")
        }
      }

      /** Computes `value . weights + weight0` for a single vector. */
      override def predict(value: T, model: WeightVector): Double = {
        import Breeze._
        val WeightVector(weights, weight0) = model
        val dotProduct = value.asBreeze.dot(weights.asBreeze)
        dotProduct + weight0
      }
    }
  }
}
// © 2015 - 2025 Weber Informatics LLC | Privacy Policy