// org.apache.spark.examples.ml.LinearRegressionExample.scala Maven / Gradle / Ivy
// The newest version!
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
// scalastyle:off println
package org.apache.spark.examples.ml
import scopt.OptionParser
import org.apache.spark.examples.mllib.AbstractParams
import org.apache.spark.ml.regression.LinearRegression
import org.apache.spark.sql.{DataFrame, SparkSession}
/**
* An example runner for linear regression with elastic-net (mixing L1/L2) regularization.
* Run with
* {{{
* bin/run-example ml.LinearRegressionExample [options]
* }}}
* A synthetic dataset can be found at `data/mllib/sample_linear_regression_data.txt` which can be
* trained by
* {{{
* bin/run-example ml.LinearRegressionExample --regParam 0.15 --elasticNetParam 1.0 \
* data/mllib/sample_linear_regression_data.txt
* }}}
* If you use it as a template to create your own app, please use `spark-submit` to submit your app.
*/
object LinearRegressionExample {

  /**
   * Command-line configuration for this example.
   *
   * @param input path to the labeled examples (required positional argument)
   * @param testInput path to a separate test dataset; per the option help text, when given,
   *                  `fracTest` is ignored
   * @param dataFormat input data format passed to the dataset loader (`libsvm` by default)
   * @param regParam regularization parameter
   * @param elasticNetParam elastic-net mixing parameter (0 = L2 penalty, 1 = L1 penalty)
   * @param maxIter maximum number of iterations
   * @param tol convergence tolerance of iterations
   * @param fracTest fraction of data to hold out for testing; must be in [0, 1)
   */
  case class Params(
      input: String = null,
      testInput: String = "",
      dataFormat: String = "libsvm",
      regParam: Double = 0.0,
      elasticNetParam: Double = 0.0,
      maxIter: Int = 100,
      tol: Double = 1E-6,
      fracTest: Double = 0.2) extends AbstractParams[Params]

  /**
   * Entry point: parses the command line into [[Params]] and runs the example.
   * Exits the JVM with status 1 when the arguments cannot be parsed or fail validation.
   *
   * @param args raw command-line arguments
   */
  def main(args: Array[String]): Unit = {
    val defaultParams = Params()

    val parser = new OptionParser[Params]("LinearRegressionExample") {
      head("LinearRegressionExample: an example Linear Regression with Elastic-Net app.")
      opt[Double]("regParam")
        .text(s"regularization parameter, default: ${defaultParams.regParam}")
        .action((x, c) => c.copy(regParam = x))
      opt[Double]("elasticNetParam")
        .text(s"ElasticNet mixing parameter. For alpha = 0, the penalty is an L2 penalty. " +
          s"For alpha = 1, it is an L1 penalty. For 0 < alpha < 1, the penalty is a combination of " +
          s"L1 and L2, default: ${defaultParams.elasticNetParam}")
        .action((x, c) => c.copy(elasticNetParam = x))
      opt[Int]("maxIter")
        .text(s"maximum number of iterations, default: ${defaultParams.maxIter}")
        .action((x, c) => c.copy(maxIter = x))
      opt[Double]("tol")
        .text(s"the convergence tolerance of iterations, Smaller value will lead " +
          s"to higher accuracy with the cost of more iterations, default: ${defaultParams.tol}")
        .action((x, c) => c.copy(tol = x))
      opt[Double]("fracTest")
        .text(s"fraction of data to hold out for testing. If given option testInput, " +
          s"this option is ignored. default: ${defaultParams.fracTest}")
        .action((x, c) => c.copy(fracTest = x))
      opt[String]("testInput")
        .text(s"input path to test dataset. If given, option fracTest is ignored." +
          s" default: ${defaultParams.testInput}")
        .action((x, c) => c.copy(testInput = x))
      opt[String]("dataFormat")
        .text("data format: libsvm (default), dense (deprecated in Spark v1.1)")
        .action((x, c) => c.copy(dataFormat = x))
      // The argument name is the placeholder shown in the usage text; an empty name would
      // leave the required input path unlabeled there.
      arg[String]("<input>")
        .text("input path to labeled examples")
        .required()
        .action((x, c) => c.copy(input = x))
      // Reject an out-of-range hold-out fraction before any Spark work starts.
      checkConfig { params =>
        if (params.fracTest < 0 || params.fracTest >= 1) {
          failure(s"fracTest ${params.fracTest} value incorrect; should be in [0,1).")
        } else {
          success
        }
      }
    }

    parser.parse(args, defaultParams) match {
      case Some(params) => run(params)
      case _ => sys.exit(1)
    }
  }

  /**
   * Trains a linear regression model with the given parameters, prints the training time,
   * the fitted weights and intercept, and evaluation results on the training and test data.
   *
   * The Spark session is stopped even when loading, training or evaluation throws.
   *
   * @param params parsed example configuration
   */
  def run(params: Params): Unit = {
    val spark = SparkSession
      .builder
      .appName(s"LinearRegressionExample with $params")
      .getOrCreate()

    try {
      println(s"LinearRegressionExample with parameters:\n$params")

      // Load training and test data through the loader shared with DecisionTreeExample.
      val (training: DataFrame, test: DataFrame) = DecisionTreeExample.loadDatasets(params.input,
        params.dataFormat, params.testInput, "regression", params.fracTest)

      val lir = new LinearRegression()
        .setFeaturesCol("features")
        .setLabelCol("label")
        .setRegParam(params.regParam)
        .setElasticNetParam(params.elasticNetParam)
        .setMaxIter(params.maxIter)
        .setTol(params.tol)

      // Train the model, timing only the call to fit.
      val startTime = System.nanoTime()
      val lirModel = lir.fit(training)
      val elapsedTime = (System.nanoTime() - startTime) / 1e9
      println(s"Training time: $elapsedTime seconds")

      // Print the weights and intercept for linear regression.
      println(s"Weights: ${lirModel.coefficients} Intercept: ${lirModel.intercept}")

      println("Training data results:")
      DecisionTreeExample.evaluateRegressionModel(lirModel, training, "label")
      println("Test data results:")
      DecisionTreeExample.evaluateRegressionModel(lirModel, test, "label")
    } finally {
      // Always release the session, including on failure paths.
      spark.stop()
    }
  }
}
// scalastyle:on println
// © 2015 - 2025 Weber Informatics LLC | Privacy Policy