// Many resources are needed to download a project. Please understand that we have to compensate our server costs. Thank you in advance. Project price: only 1 $.
// You can buy this project and download/modify it as often as you want.
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/package org.apache.spark.ml.regression
import java.util.Localeimport breeze.stats.{distributions => dist}
import breeze.stats.distributions.Rand.FixedSeed.randBasis
import org.apache.commons.lang3.StringUtilsimport org.apache.hadoop.fs.Pathimport org.apache.spark.SparkExceptionimport org.apache.spark.annotation.Sinceimport org.apache.spark.internal.Loggingimport org.apache.spark.ml.PredictorParamsimport org.apache.spark.ml.attribute._
import org.apache.spark.ml.feature.{Instance, OffsetInstance}
import org.apache.spark.ml.linalg.{BLAS, Vector, Vectors}
import org.apache.spark.ml.optim._
import org.apache.spark.ml.param._
import org.apache.spark.ml.param.shared._
import org.apache.spark.ml.util._
import org.apache.spark.ml.util.DatasetUtils._
import org.apache.spark.ml.util.Instrumentation.instrumented
import org.apache.spark.rdd.RDDimport org.apache.spark.sql.{Column, DataFrame, Dataset, Row}
import org.apache.spark.sql.functions._
import org.apache.spark.sql.types.{DataType, DoubleType, StructType}
/**
 * Params shared by the Generalized Linear Regression estimator and its model.
 */
private[regression] trait GeneralizedLinearRegressionBase extends PredictorParams
  with HasFitIntercept with HasMaxIter with HasTol with HasRegParam with HasWeightCol
  with HasSolver with HasAggregationDepth with Logging {

  import GeneralizedLinearRegression._

  /**
   * Name of the family, i.e. the description of the error distribution used in the model.
   * Supported options: "gaussian", "binomial", "poisson", "gamma" and "tweedie".
   * Matching is case-insensitive. Default is "gaussian".
   *
   * @group param
   */
  @Since("2.0.0")
  final val family: Param[String] = new Param[String](
    this,
    "family",
    "The name of family which is a description of the error distribution to be used in the " +
      s"model. Supported options: ${supportedFamilyNames.mkString(", ")}.",
    (candidate: String) => supportedFamilyNames.contains(candidate.toLowerCase(Locale.ROOT)))

  /** @group getParam */
  @Since("2.0.0")
  def getFamily: String = $(family)

  /**
   * Power in the variance function of the Tweedie distribution, which relates the variance
   * of the distribution to its mean. Only applicable to the Tweedie family.
   * (see <a href="https://en.wikipedia.org/wiki/Tweedie_distribution">
   * Tweedie Distribution (Wikipedia)</a>)
   * Supported values: 0 and [1, Inf).
   * Variance power 0, 1, or 2 corresponds to the Gaussian, Poisson or Gamma family,
   * respectively.
   *
   * @group param
   */
  @Since("2.2.0")
  final val variancePower: DoubleParam = new DoubleParam(
    this,
    "variancePower",
    "The power in the variance function of the Tweedie distribution which characterizes " +
      "the relationship between the variance and mean of the distribution. " +
      "Only applicable to the Tweedie family. Supported values: 0 and [1, Inf).",
    (power: Double) => power == 0.0 || power >= 1.0)

  /** @group getParam */
  @Since("2.2.0")
  def getVariancePower: Double = $(variancePower)

  /**
   * Name of the link function, which relates the linear predictor to the mean of the
   * distribution function.
   * Supported options: "identity", "log", "inverse", "logit", "probit", "cloglog" and "sqrt".
   * Matching is case-insensitive. Used only when family is not "tweedie"; the link function
   * for the "tweedie" family must be specified through [[linkPower]].
   *
   * @group param
   */
  @Since("2.0.0")
  final val link: Param[String] = new Param[String](
    this,
    "link",
    "The name of link function " +
      "which provides the relationship between the linear predictor and the mean of the " +
      s"distribution function. Supported options: ${supportedLinkNames.mkString(", ")}",
    (candidate: String) => supportedLinkNames.contains(candidate.toLowerCase(Locale.ROOT)))

  /** @group getParam */
  @Since("2.0.0")
  def getLink: String = $(link)

  /**
   * Index in the power link function. Only applicable to the Tweedie family.
   * Link power 0, 1, -1 or 0.5 corresponds to the Log, Identity, Inverse or Sqrt link,
   * respectively.
   * When not set, this value defaults to 1 - [[variancePower]], which matches the R "statmod"
   * package.
   *
   * @group param
   */
  @Since("2.2.0")
  final val linkPower: DoubleParam = new DoubleParam(
    this,
    "linkPower",
    "The index in the power link function. Only applicable to the Tweedie family.")

  /** @group getParam */
  @Since("2.2.0")
  def getLinkPower: Double = $(linkPower)

  /**
   * Name of the link prediction (linear predictor) output column.
   * Default is not set, which means the link prediction is not output.
   *
   * @group param
   */
  @Since("2.0.0")
  final val linkPredictionCol: Param[String] = new Param[String](
    this,
    "linkPredictionCol",
    "link prediction (linear predictor) column name")

  /** @group getParam */
  @Since("2.0.0")
  def getLinkPredictionCol: String = $(linkPredictionCol)

  /**
   * Name of the offset column. If this is not set or empty, all instance offsets are treated
   * as 0.0. The feature specified as offset has a constant coefficient of 1.0.
   *
   * @group param
   */
  @Since("2.3.0")
  final val offsetCol: Param[String] = new Param[String](
    this,
    "offsetCol",
    "The offset " +
      "column name. If this is not set or empty, we treat all instance offsets as 0.0")

  /** @group getParam */
  @Since("2.3.0")
  def getOffsetCol: String = $(offsetCol)

  /** True when the weight column has been explicitly set to a non-empty name. */
  private[regression] def hasWeightCol: Boolean = isSet(weightCol) && $(weightCol).nonEmpty

  /** True when the offset column has been explicitly set to a non-empty name. */
  private[regression] def hasOffsetCol: Boolean = isSet(offsetCol) && $(offsetCol).nonEmpty

  /** True when a non-empty link prediction column is defined, i.e. it should be output. */
  private[regression] def hasLinkPredictionCol: Boolean =
    isDefined(linkPredictionCol) && $(linkPredictionCol).nonEmpty

  /**
   * The solver algorithm for optimization.
   * Supported options: "irls" (iteratively reweighted least squares).
   * Default: "irls"
   *
   * @group param
   */
  @Since("2.0.0")
  final override val solver: Param[String] = new Param[String](
    this,
    "solver",
    "The solver algorithm for optimization. Supported options: " +
      s"${supportedSolvers.mkString(", ")}. (Default irls)",
    ParamValidators.inArray[String](supportedSolvers))

  // Must run after the param vals above have been initialized.
  setDefault(family -> Gaussian.name, variancePower -> 0.0, maxIter -> 25, tol -> 1E-6,
    regParam -> 0.0, solver -> IRLS)

  /**
   * Validates the family/link related params against each other and against the input
   * schema, then returns the output schema.
   *
   * Params that have no effect for the chosen family only trigger a warning. For non-tweedie
   * families an explicitly set link must form a supported (family, link) pair, otherwise
   * `require` fails. When the offset column is in use it must be numeric, and when a link
   * prediction column is in use it is appended to the returned schema.
   */
  @Since("2.0.0")
  override def validateAndTransformSchema(
      schema: StructType,
      fitting: Boolean,
      featuresDataType: DataType): StructType = {
    val isTweedie = $(family).toLowerCase(Locale.ROOT) == "tweedie"

    if (isTweedie) {
      if (isSet(link)) {
        logWarning("When family is tweedie, use param linkPower to specify link function. " +
          "Setting param link will take no effect.")
      }
    } else {
      if (isSet(variancePower)) {
        logWarning("When family is not tweedie, setting param variancePower will take no effect.")
      }
      if (isSet(linkPower)) {
        logWarning("When family is not tweedie, use param link to specify link function. " +
          "Setting param linkPower will take no effect.")
      }
      if (isSet(link)) {
        val requestedPair = (Family.fromParams(this), Link.fromParams(this))
        require(supportedFamilyAndLinkPairs.contains(requestedPair),
          s"Generalized Linear Regression with ${$(family)} family " +
            s"does not support ${$(link)} link function.")
      }
    }

    val baseSchema = super.validateAndTransformSchema(schema, fitting, featuresDataType)

    if (hasOffsetCol) {
      SchemaUtils.checkNumericType(schema, $(offsetCol))
    }

    if (!hasLinkPredictionCol) {
      baseSchema
    } else {
      val linkAttr = NumericAttribute.defaultAttr.withName($(linkPredictionCol))
      SchemaUtils.appendColumn(baseSchema, linkAttr.toStructField())
    }
  }
}
/**
 * Fit a Generalized Linear Model
 * (see <a href="https://en.wikipedia.org/wiki/Generalized_linear_model">
 * Generalized linear model (Wikipedia)</a>)
 * specified by giving a symbolic description of the linear
 * predictor (link function) and a description of the error distribution (family).
 * It supports "gaussian", "binomial", "poisson", "gamma" and "tweedie" as family.
 * Valid link functions for each family is listed below. The first link function of each family
 * is the default one.
 *  - "gaussian" : "identity", "log", "inverse"
 *  - "binomial" : "logit", "probit", "cloglog"
 *  - "poisson"  : "log", "identity", "sqrt"
 *  - "gamma"    : "inverse", "identity", "log"
 *  - "tweedie"  : power link function specified through "linkPower". The default link power in
 *  the tweedie family is 1 - variancePower.
 */
@Since("2.0.0")
class GeneralizedLinearRegression @Since("2.0.0") (@Since("2.0.0") override val uid: String)
  extends Regressor[Vector, GeneralizedLinearRegression, GeneralizedLinearRegressionModel]
  with GeneralizedLinearRegressionBase with DefaultParamsWritable with Logging {

  import GeneralizedLinearRegression._

  /** Creates an estimator with a randomly generated uid prefixed with "glm". */
  @Since("2.0.0")
  def this() = this(Identifiable.randomUID("glm"))

  /**
   * Sets the value of param [[family]].
   * Default is "gaussian".
   *
   * @group setParam
   */
  @Since("2.0.0")
  def setFamily(value: String): this.type = set(family, value)

  /**
   * Sets the value of param [[variancePower]].
   * Used only when family is "tweedie".
   * Default is 0.0, which corresponds to the "gaussian" family.
   *
   * @group setParam
   */
  @Since("2.2.0")
  def setVariancePower(value: Double): this.type = set(variancePower, value)

  /**
   * Sets the value of param [[linkPower]].
   * Used only when family is "tweedie".
   *
   * @group setParam
   */
  @Since("2.2.0")
  def setLinkPower(value: Double): this.type = set(linkPower, value)

  /**
   * Sets the value of param [[link]].
   * Used only when family is not "tweedie".
   *
   * @group setParam
   */
  @Since("2.0.0")
  def setLink(value: String): this.type = set(link, value)

  /**
   * Sets if we should fit the intercept.
   * Default is true.
   *
   * @group setParam
   */
  @Since("2.0.0")
  def setFitIntercept(value: Boolean): this.type = set(fitIntercept, value)

  /**
   * Sets the maximum number of iterations (applicable for solver "irls").
   * Default is 25.
   *
   * @group setParam
   */
  @Since("2.0.0")
  def setMaxIter(value: Int): this.type = set(maxIter, value)

  /**
   * Sets the convergence tolerance of iterations.
   * Smaller value will lead to higher accuracy with the cost of more iterations.
   * Default is 1E-6.
   *
   * @group setParam
   */
  @Since("2.0.0")
  def setTol(value: Double): this.type = set(tol, value)

  /**
   * Sets the regularization parameter for L2 regularization.
   * The regularization term is
   * <blockquote>
   *    $$
   *    0.5 * regParam * L2norm(coefficients)^2
   *    $$
   * </blockquote>
   * Default is 0.0.
   *
   * @group setParam
   */
  @Since("2.0.0")
  def setRegParam(value: Double): this.type = set(regParam, value)

  /**
   * Sets the value of param [[weightCol]].
   * If this is not set or empty, we treat all instance weights as 1.0.
   * Default is not set, so all instances have weight one.
   * In the Binomial family, weights correspond to number of trials and should be integer.
   * Non-integer weights are rounded to integer in AIC calculation.
   *
   * @group setParam
   */
  @Since("2.0.0")
  def setWeightCol(value: String): this.type = set(weightCol, value)

  /**
   * Sets the value of param [[offsetCol]].
   * If this is not set or empty, we treat all instance offsets as 0.0.
   * Default is not set, so all instances have offset 0.0.
   *
   * @group setParam
   */
  @Since("2.3.0")
  def setOffsetCol(value: String): this.type = set(offsetCol, value)

  /**
   * Sets the solver algorithm used for optimization.
   * Currently only supports "irls" which is also the default solver.
   *
   * @group setParam
   */
  @Since("2.0.0")
  def setSolver(value: String): this.type = set(solver, value)

  /**
   * Sets the link prediction (linear predictor) column name.
   *
   * @group setParam
   */
  @Since("2.0.0")
  def setLinkPredictionCol(value: String): this.type = set(linkPredictionCol, value)

  /** @group expertSetParam */
  @Since("3.0.0")
  def setAggregationDepth(value: Int): this.type = set(aggregationDepth, value)

  /**
   * Fits the model on `dataset`.
   *
   * The Gaussian family with identity link is solved directly by a single weighted least
   * squares fit on `label - offset`; every other family/link combination is fitted by
   * iteratively reweighted least squares (IRLS) starting from `familyAndLink.initialize`.
   * The returned model carries a training summary.
   *
   * Throws a `SparkException` when the number of features exceeds
   * `WeightedLeastSquares.MAX_NUM_FEATURES`, and fails `require` when there are 0 features
   * and `fitIntercept` is false.
   */
  override protected def train(
      dataset: Dataset[_]): GeneralizedLinearRegressionModel = instrumented { instr =>
    val familyAndLink = FamilyAndLink(this)

    instr.logPipelineStage(this)
    instr.logDataset(dataset)
    instr.logParams(this, labelCol, featuresCol, weightCol, offsetCol, predictionCol,
      linkPredictionCol, family, solver, fitIntercept, link, maxIter, regParam, tol,
      aggregationDepth)

    val numFeatures = getNumFeatures(dataset, $(featuresCol))
    instr.logNumFeatures(numFeatures)

    if (numFeatures > WeightedLeastSquares.MAX_NUM_FEATURES) {
      val msg = "Currently, GeneralizedLinearRegression only supports number of features" +
        s" <= ${WeightedLeastSquares.MAX_NUM_FEATURES}. Found $numFeatures in the input dataset."
      throw new SparkException(msg)
    }

    require(numFeatures > 0 || $(fitIntercept),
      "GeneralizedLinearRegression was given data with 0 features, and with Param fitIntercept " +
        "set to false. To fit a model with 0 features, fitIntercept must be set to true." )

    // Column order here (label, weight, offset, features) is what the Row patterns below expect.
    val validated = dataset.select(
      checkRegressionLabels($(labelCol)),
      checkNonNegativeWeights(get(weightCol)),
      if (!hasOffsetCol) lit(0.0) else checkNonNanValues($(offsetCol), "Offsets"),
      checkNonNanVectors($(featuresCol))
    )

    // Both solvers yield the same four pieces; the model and summary are assembled once below.
    val (coefficients, intercept, diagInvAtWA, numIterations) =
      if (familyAndLink.family == Gaussian && familyAndLink.link == Identity) {
        // TODO: Make standardizeFeatures and standardizeLabel configurable.
        val instances = validated.rdd.map {
          case Row(label: Double, weight: Double, offset: Double, features: Vector) =>
            Instance(label - offset, weight, features)
        }
        val optimizer = new WeightedLeastSquares($(fitIntercept), $(regParam),
          elasticNetParam = 0.0, standardizeFeatures = true, standardizeLabel = true)
        val wlsModel = optimizer.fit(instances, instr = OptionalInstrumentation.create(instr),
          depth = $(aggregationDepth))
        // A direct WLS solve is reported as a single iteration.
        (wlsModel.coefficients, wlsModel.intercept, wlsModel.diagInvAtWA.toArray, 1)
      } else {
        val instances = validated.rdd.map {
          case Row(label: Double, weight: Double, offset: Double, features: Vector) =>
            OffsetInstance(label, weight, offset, features)
        }
        // Fit Generalized Linear Model by iteratively reweighted least squares (IRLS).
        val initialModel = familyAndLink.initialize(instances, $(fitIntercept), $(regParam),
          instr = OptionalInstrumentation.create(instr), $(aggregationDepth))
        val optimizer = new IterativelyReweightedLeastSquares(initialModel,
          familyAndLink.reweightFunc, $(fitIntercept), $(regParam), $(maxIter), $(tol))
        val irlsModel = optimizer.fit(instances, instr = OptionalInstrumentation.create(instr))
        (irlsModel.coefficients, irlsModel.intercept, irlsModel.diagInvAtWA.toArray,
          irlsModel.numIterations)
      }

    val model = copyValues(
      new GeneralizedLinearRegressionModel(uid, coefficients, intercept).setParent(this))
    val trainingSummary = new GeneralizedLinearRegressionTrainingSummary(dataset, model,
      diagInvAtWA, numIterations, getSolver)
    model.setSummary(Some(trainingSummary))
  }

  @Since("2.0.0")
  override def copy(extra: ParamMap): GeneralizedLinearRegression = defaultCopy(extra)
}
@Since("2.0.0")
object GeneralizedLinearRegression extends DefaultParamsReadable[GeneralizedLinearRegression] {

  @Since("2.0.0")
  override def load(path: String): GeneralizedLinearRegression = super.load(path)

  /**
   * Set of family (except for tweedie) and link pairs that GeneralizedLinearRegression supports.
   * The link function of the Tweedie family is specified through param linkPower.
   */
  private[regression] lazy val supportedFamilyAndLinkPairs = Set(
    Gaussian -> Identity, Gaussian -> Log, Gaussian -> Inverse,
    Binomial -> Logit, Binomial -> Probit, Binomial -> CLogLog,
    Poisson -> Log, Poisson -> Identity, Poisson -> Sqrt,
    Gamma -> Inverse, Gamma -> Identity, Gamma -> Log
  )

  /** String name for "irls" (iteratively reweighted least squares) solver. */
  private[regression] val IRLS = "irls"

  /** Set of solvers that GeneralizedLinearRegression supports. */
  private[regression] val supportedSolvers = Array(IRLS)

  /** Set of family names that GeneralizedLinearRegression supports. */
  private[regression] lazy val supportedFamilyNames =
    supportedFamilyAndLinkPairs.map(_._1.name).toArray :+ "tweedie"

  /** Set of link names that GeneralizedLinearRegression supports. */
  private[regression] lazy val supportedLinkNames =
    supportedFamilyAndLinkPairs.map(_._2.name).toArray

  /** Lower bound used by the `project` methods below to keep mu away from 0 (and 1). */
  private[regression] val epsilon: Double = 1E-16

  /** Computes y * log(y / mu), defined as 0.0 when y is 0. */
  private[regression] def ylogy(y: Double, mu: Double): Double = {
    if (y == 0) 0.0 else y * math.log(y / mu)
  }

  /**
   * Wrapper of family and link combination used in the model.
   */
  private[regression] class FamilyAndLink(val family: Family, val link: Link)
    extends Serializable {

    /** Linear predictor based on given mu. */
    def predict(mu: Double): Double = link.link(family.project(mu))

    /** Fitted value based on linear predictor eta. */
    def fitted(eta: Double): Double = family.project(link.unlink(eta))

    /**
     * Get the initial guess model for [[IterativelyReweightedLeastSquares]].
     *
     * Each instance's label is replaced by the linear predictor of the family's starting mu
     * minus the instance offset, and a weighted least squares model is fitted on the result.
     */
    def initialize(
        instances: RDD[OffsetInstance],
        fitIntercept: Boolean,
        regParam: Double,
        instr: OptionalInstrumentation = OptionalInstrumentation.create(
          classOf[GeneralizedLinearRegression]),
        depth: Int = 2
      ): WeightedLeastSquaresModel = {
      val newInstances = instances.map { instance =>
        val mu = family.initialize(instance.label, instance.weight)
        val eta = predict(mu) - instance.offset
        Instance(eta, instance.weight, instance.features)
      }
      // TODO: Make standardizeFeatures and standardizeLabel configurable.
      val initialModel = new WeightedLeastSquares(fitIntercept, regParam, elasticNetParam = 0.0,
        standardizeFeatures = true, standardizeLabel = true)
        .fit(newInstances, instr, depth)
      initialModel
    }

    /**
     * The reweight function used to update working labels and weights
     * at each iteration of [[IterativelyReweightedLeastSquares]].
     *
     * @return the pair (working label, working weight) for the given instance
     */
    def reweightFunc(
        instance: OffsetInstance, model: WeightedLeastSquaresModel): (Double, Double) = {
      val eta = model.predict(instance.features) + instance.offset
      val mu = fitted(eta)
      val newLabel = eta - instance.offset + (instance.label - mu) * link.deriv(mu)
      val newWeight = instance.weight / (math.pow(this.link.deriv(mu), 2.0) * family.variance(mu))
      (newLabel, newWeight)
    }
  }

  private[regression] object FamilyAndLink {

    /**
     * Constructs the FamilyAndLink object from a parameter map.
     *
     * The link comes from the params only when the relevant param is explicitly set
     * (`link` for non-tweedie families, `linkPower` for tweedie); otherwise the family's
     * default link is used.
     */
    def apply(params: GeneralizedLinearRegressionBase): FamilyAndLink = {
      val familyObj = Family.fromParams(params)
      val linkObj =
        if ((params.getFamily.toLowerCase(Locale.ROOT) != "tweedie" &&
              params.isSet(params.link)) ||
            (params.getFamily.toLowerCase(Locale.ROOT) == "tweedie" &&
              params.isSet(params.linkPower))) {
          Link.fromParams(params)
        } else {
          familyObj.defaultLink
        }
      new FamilyAndLink(familyObj, linkObj)
    }
  }

  /**
   * A description of the error distribution to be used in the model.
   *
   * @param name the name of the family.
   */
  private[regression] abstract class Family(val name: String) extends Serializable {

    /** The default link instance of this family. */
    val defaultLink: Link

    /** Initialize the starting value for mu. */
    def initialize(y: Double, weight: Double): Double

    /** The variance of the endogenous variable's mean, given the value mu. */
    def variance(mu: Double): Double

    /** Deviance of (y, mu) pair. */
    def deviance(y: Double, mu: Double, weight: Double): Double

    /**
     * Akaike Information Criterion (AIC) value of the family for a given dataset.
     *
     * @param predictions an RDD of (y, mu, weight) of instances in evaluation dataset
     * @param deviance the deviance for the fitted model in evaluation dataset
     * @param numInstances number of instances in evaluation dataset
     * @param weightSum weights sum of instances in evaluation dataset
     */
    def aic(
        predictions: RDD[(Double, Double, Double)],
        deviance: Double,
        numInstances: Double,
        weightSum: Double): Double

    /** Trim the fitted value so that it will be in valid range. */
    def project(mu: Double): Double = mu
  }

  private[regression] object Family {

    /**
     * Gets the [[Family]] object based on param family and variancePower.
     * If param family is set with "gaussian", "binomial", "poisson" or "gamma",
     * return the corresponding object directly; otherwise, construct a Tweedie object
     * according to variancePower.
     *
     * @param params the parameter map containing family name and variance power
     */
    def fromParams(params: GeneralizedLinearRegressionBase): Family = {
      params.getFamily.toLowerCase(Locale.ROOT) match {
        case Gaussian.name => Gaussian
        case Binomial.name => Binomial
        case Poisson.name => Poisson
        case Gamma.name => Gamma
        case "tweedie" =>
          // Variance powers 0, 1 and 2 reuse the dedicated family objects.
          params.getVariancePower match {
            case 0.0 => Gaussian
            case 1.0 => Poisson
            case 2.0 => Gamma
            case others => new Tweedie(others)
          }
      }
    }
  }

  /**
   * Tweedie exponential family distribution.
   * This includes the special cases of Gaussian, Poisson and Gamma.
   */
  private[regression] class Tweedie(val variancePower: Double)
    extends Family("tweedie") {

    override val defaultLink: Link = new Power(1.0 - variancePower)

    override def initialize(y: Double, weight: Double): Double = {
      if (variancePower >= 1.0 && variancePower < 2.0) {
        require(y >= 0.0, s"The response variable of $name($variancePower) family " +
          s"should be non-negative, but got $y")
      } else if (variancePower >= 2.0) {
        require(y > 0.0, s"The response variable of $name($variancePower) family " +
          s"should be positive, but got $y")
      }
      // A zero response is replaced by Tweedie.delta as the starting mu.
      if (y == 0) Tweedie.delta else y
    }

    override def variance(mu: Double): Double = math.pow(mu, variancePower)

    /** Returns log(y / mu) when p is 0, otherwise (y^p - mu^p) / p. */
    private def yp(y: Double, mu: Double, p: Double): Double = {
      if (p == 0) {
        math.log(y / mu)
      } else {
        (math.pow(y, p) - math.pow(mu, p)) / p
      }
    }

    override def deviance(y: Double, mu: Double, weight: Double): Double = {
      // Force y >= delta for Poisson or compound Poisson
      val y1 = if (variancePower >= 1.0 && variancePower < 2.0) {
        math.max(y, Tweedie.delta)
      } else {
        y
      }
      2.0 * weight *
        (y * yp(y1, mu, 1.0 - variancePower) - yp(y, mu, 2.0 - variancePower))
    }

    override def aic(
        predictions: RDD[(Double, Double, Double)],
        deviance: Double,
        numInstances: Double,
        weightSum: Double): Double = {
      /*
       This depends on the density of the Tweedie distribution.
       Only implemented for Gaussian, Poisson and Gamma at this point.
      */
      throw new UnsupportedOperationException("No AIC available for the tweedie family")
    }

    override def project(mu: Double): Double = {
      if (mu < epsilon) {
        epsilon
      } else if (mu.isInfinity) {
        Double.MaxValue
      } else {
        mu
      }
    }
  }

  private[regression] object Tweedie {

    /** Constant used in initialization and deviance to avoid numerical issues. */
    val delta: Double = 0.1
  }

  /**
   * Gaussian exponential family distribution.
   * The default link for the Gaussian family is the identity link.
   */
  private[regression] object Gaussian extends Tweedie(0.0) {

    override val name: String = "gaussian"

    override val defaultLink: Link = Identity

    override def initialize(y: Double, weight: Double): Double = y

    override def variance(mu: Double): Double = 1.0

    override def deviance(y: Double, mu: Double, weight: Double): Double = {
      weight * (y - mu) * (y - mu)
    }

    override def aic(
        predictions: RDD[(Double, Double, Double)],
        deviance: Double,
        numInstances: Double,
        weightSum: Double): Double = {
      // Sum of log-weights over all instances.
      val wt = predictions.map(x => math.log(x._3)).sum()
      numInstances * (math.log(deviance / numInstances * 2.0 * math.Pi) + 1.0) + 2.0 - wt
    }

    /** Replaces infinite values by the largest finite magnitude of the same sign. */
    override def project(mu: Double): Double = {
      if (mu.isNegInfinity) {
        Double.MinValue
      } else if (mu.isPosInfinity) {
        Double.MaxValue
      } else {
        mu
      }
    }
  }

  /**
   * Binomial exponential family distribution.
   * The default link for the Binomial family is the logit link.
   */
  private[regression] object Binomial extends Family("binomial") {

    val defaultLink: Link = Logit

    override def initialize(y: Double, weight: Double): Double = {
      val mu = (weight * y + 0.5) / (weight + 1.0)
      // The two literals previously concatenated to "...Binomial familyshould be...".
      require(mu > 0.0 && mu < 1.0, "The response variable of Binomial family " +
        s"should be in range (0, 1), but got $mu")
      mu
    }

    override def variance(mu: Double): Double = mu * (1.0 - mu)

    override def deviance(y: Double, mu: Double, weight: Double): Double = {
      2.0 * weight * (ylogy(y, mu) + ylogy(1.0 - y, 1.0 - mu))
    }

    override def aic(
        predictions: RDD[(Double, Double, Double)],
        deviance: Double,
        numInstances: Double,
        weightSum: Double): Double = {
      -2.0 * predictions.map { case (y: Double, mu: Double, weight: Double) =>
        // weights for Binomial distribution correspond to number of trials
        val wt = math.round(weight).toInt
        if (wt == 0) {
          0.0
        } else {
          dist.Binomial(wt, mu).logProbabilityOf(math.round(y * weight).toInt)
        }
      }.sum()
    }

    /** Clamps mu into [epsilon, 1 - epsilon]. */
    override def project(mu: Double): Double = {
      if (mu < epsilon) {
        epsilon
      } else if (mu > 1.0 - epsilon) {
        1.0 - epsilon
      } else {
        mu
      }
    }
  }

  /**
   * Poisson exponential family distribution.
   * The default link for the Poisson family is the log link.
   */
  private[regression] object Poisson extends Tweedie(1.0) {

    override val name: String = "poisson"

    override val defaultLink: Link = Log

    override def initialize(y: Double, weight: Double): Double = {
      require(y >= 0.0, "The response variable of Poisson family " +
        s"should be non-negative, but got $y")
      /*
        Force Poisson mean > 0 to avoid numerical instability in IRLS.
        R uses y + delta for initialization. See poisson()$initialize.
       */
      math.max(y, Tweedie.delta)
    }

    override def variance(mu: Double): Double = mu

    override def deviance(y: Double, mu: Double, weight: Double): Double = {
      2.0 * weight * (ylogy(y, mu) - (y - mu))
    }

    override def aic(
        predictions: RDD[(Double, Double, Double)],
        deviance: Double,
        numInstances: Double,
        weightSum: Double): Double = {
      -2.0 * predictions.map { case (y: Double, mu: Double, weight: Double) =>
        weight * dist.Poisson(mu).logProbabilityOf(y.toInt)
      }.sum()
    }
  }

  /**
   * Gamma exponential family distribution.
   * The default link for the Gamma family is the inverse link.
   */
  private[regression] object Gamma extends Tweedie(2.0) {

    override val name: String = "gamma"

    override val defaultLink: Link = Inverse

    override def initialize(y: Double, weight: Double): Double = {
      require(y > 0.0, "The response variable of Gamma family " +
        s"should be positive, but got $y")
      y
    }

    override def variance(mu: Double): Double = mu * mu

    override def deviance(y: Double, mu: Double, weight: Double): Double = {
      -2.0 * weight * (math.log(y / mu) - (y - mu)/mu)
    }

    override def aic(
        predictions: RDD[(Double, Double, Double)],
        deviance: Double,
        numInstances: Double,
        weightSum: Double): Double = {
      // Dispersion is taken as deviance divided by the sum of weights.
      val disp = deviance / weightSum
      -2.0 * predictions.map { case (y: Double, mu: Double, weight: Double) =>
        weight * dist.Gamma(1.0 / disp, mu * disp).logPdf(y)
      }.sum() + 2.0
    }
  }

  /**
   * A description of the link function to be used in the model.
   * The link function provides the relationship between the linear predictor
   * and the mean of the distribution function.
   *
   * @param name the name of link function.
   */
  private[regression] abstract class Link(val name: String) extends Serializable {

    /** The link function. */
    def link(mu: Double): Double

    /** Derivative of the link function. */
    def deriv(mu: Double): Double

    /** The inverse link function. */
    def unlink(eta: Double): Double
  }

  private[regression] object Link {

    /**
     * Gets the [[Link]] object based on param family, link and linkPower.
     * If param family is set with "tweedie", return or construct link function object
     * according to linkPower; otherwise, return link function object according to link.
     *
     * @param params the parameter map containing family, link and linkPower
     */
    def fromParams(params: GeneralizedLinearRegressionBase): Link = {
      if (params.getFamily.toLowerCase(Locale.ROOT) == "tweedie") {
        // Link powers 0, 1, -1 and 0.5 reuse the dedicated link objects.
        params.getLinkPower match {
          case 0.0 => Log
          case 1.0 => Identity
          case -1.0 => Inverse
          case 0.5 => Sqrt
          case others => new Power(others)
        }
      } else {
        params.getLink.toLowerCase(Locale.ROOT) match {
          case Identity.name => Identity
          case Logit.name => Logit
          case Log.name => Log
          case Inverse.name => Inverse
          case Probit.name => Probit
          case CLogLog.name => CLogLog
          case Sqrt.name => Sqrt
        }
      }
    }
  }

  /** Power link function class */
  private[regression] class Power(val linkPower: Double)
    extends Link("power") {

    /** log(mu) when linkPower is 0, otherwise mu^linkPower. */
    override def link(mu: Double): Double = {
      if (linkPower == 0.0) {
        math.log(mu)
      } else {
        math.pow(mu, linkPower)
      }
    }

    /** 1 / mu when linkPower is 0, otherwise linkPower * mu^(linkPower - 1). */
    override def deriv(mu: Double): Double = {
      if (linkPower == 0.0) {
        1.0 / mu
      } else {
        linkPower * math.pow(mu, linkPower - 1.0)
      }
    }

    /** exp(eta) when linkPower is 0, otherwise eta^(1 / linkPower). */
    override def unlink(eta: Double): Double = {
      if (linkPower == 0.0) {
        math.exp(eta)
      } else {
        math.pow(eta, 1.0 / linkPower)
      }
    }
  }

  private[regression] object Identity extends Power(1.0) {

    override val name: String = "identity"

    override def link(mu: Double): Double = mu

    override def deriv(mu: Double): Double = 1.0

    override def unlink(eta: Double): Double = eta
  }

  private[regression] object Logit extends Link("logit") {

    override def link(mu: Double): Double = math.log(mu / (1.0 - mu))

    override def deriv(mu: Double): Double = 1.0 / (mu * (1.0 - mu))

    override def unlink(eta: Double): Double = 1.0 / (1.0 + math.exp(-1.0 * eta))
  }

  private[regression] object Log extends Power(0.0) {

    override val name: String = "log"

    override def link(mu: Double): Double = math.log(mu)

    override def deriv(mu: Double): Double = 1.0 / mu

    override def unlink(eta: Double): Double = math.exp(eta)
  }

  private[regression] object Inverse extends Power(-1.0) {

    override val name: String = "inverse"

    override def link(mu: Double): Double = 1.0 / mu

    override def deriv(mu: Double): Double = -1.0 * math.pow(mu, -2.0)

    override def unlink(eta: Double): Double = 1.0 / eta
  }

  private[regression] object Probit extends Link("probit") {

    override def link(mu: Double): Double = dist.Gaussian(0.0, 1.0).inverseCdf(mu)

    override def deriv(mu: Double): Double = {
      1.0 / dist.Gaussian(0.0, 1.0).pdf(dist.Gaussian(0.0, 1.0).inverseCdf(mu))
    }

    override def unlink(eta: Double): Double = dist.Gaussian(0.0, 1.0).cdf(eta)
  }

  private[regression] object CLogLog extends Link("cloglog") {

    override def link(mu: Double): Double = math.log(-math.log1p(-mu))

    override def deriv(mu: Double): Double = 1.0 / ((mu - 1.0) * math.log1p(-mu))

    override def unlink(eta: Double): Double = 1.0 - math.exp(-1.0 * math.exp(eta))
  }

  private[regression] object Sqrt extends Power(0.5) {

    override val name: String = "sqrt"

    override def link(mu: Double): Double = math.sqrt(mu)

    override def deriv(mu: Double): Double = 1.0 / (2.0 * math.sqrt(mu))

    override def unlink(eta: Double): Double = eta * eta
  }
}
/**
* Model produced by [[GeneralizedLinearRegression]].
*/@Since("2.0.0")
classGeneralizedLinearRegressionModelprivate[ml] (
@Since("2.0.0") overridevaluid: String,
@Since("2.0.0") val coefficients: Vector,
@Since("2.0.0") val intercept: Double)
extendsRegressionModel[Vector, GeneralizedLinearRegressionModel]
withGeneralizedLinearRegressionBasewithMLWritablewithHasTrainingSummary[GeneralizedLinearRegressionTrainingSummary] {
/**
 * Sets the name of the column that receives the link prediction (linear predictor).
 *
 * @group setParam
 */
@Since("2.0.0")
def setLinkPredictionCol(value: String): this.type = {
  set(linkPredictionCol, value)
}
import GeneralizedLinearRegression._

// Family/link pair built from this model's params the first time it is needed.
private lazy val familyAndLink: FamilyAndLink = FamilyAndLink.apply(this)
/** Predicts the response for `features` using an offset of 0.0. */
override def predict(features: Vector): Double = predict(features, 0.0)
/**
 * Calculates the predicted value when offset is set: the link prediction is computed
 * first and then mapped to a fitted value by `familyAndLink`.
 */
private def predict(features: Vector, offset: Double): Double =
  familyAndLink.fitted(predictLink(features, offset))
/**
 * Calculates the link prediction (linear predictor) of the given instance, i.e. the dot
 * product of features and coefficients plus the intercept plus the offset.
 */
private def predictLink(features: Vector, offset: Double): Double = {
  val linearTerm = BLAS.dot(features, coefficients) + intercept
  linearTerm + offset
}
/** Validates the input schema via `transformSchema`, then delegates to `transformImpl`. */
override def transform(dataset: Dataset[_]): DataFrame = {
  val inputSchema = dataset.schema
  transformSchema(inputSchema)
  transformImpl(dataset)
}
/**
 * Appends the configured output columns to `dataset`.
 *
 * The link prediction column is added when `hasLinkPredictionCol` holds, and the prediction
 * column when `predictionCol` is non-empty. If the link prediction column is present the
 * prediction is derived from it; otherwise it is computed from the features and offset.
 * A warning is logged when neither column is produced.
 */
override protected def transformImpl(dataset: Dataset[_]): DataFrame = {
  val outputSchema = transformSchema(dataset.schema, logging = true)

  // Offsets default to a literal 0.0 when no offset column is in use.
  val offsetColumn = if (hasOffsetCol) col($(offsetCol)).cast(DoubleType) else lit(0.0)

  var result = dataset
  var addedColumns = 0

  if (hasLinkPredictionCol) {
    val linkUdf = udf { (features: Vector, offset: Double) => predictLink(features, offset) }
    result = result.withColumn(
      $(linkPredictionCol),
      linkUdf(col($(featuresCol)), offsetColumn),
      outputSchema($(linkPredictionCol)).metadata)
    addedColumns += 1
  }

  if ($(predictionCol).nonEmpty) {
    val predicted =
      if (hasLinkPredictionCol) {
        // Reuse the link prediction column that was just appended.
        val fittedUdf = udf { (eta: Double) => familyAndLink.fitted(eta) }
        fittedUdf(col($(linkPredictionCol)))
      } else {
        val predictUdf = udf { (features: Vector, offset: Double) => predict(features, offset) }
        predictUdf(col($(featuresCol)), offsetColumn)
      }
    result = result.withColumn(
      $(predictionCol),
      predicted,
      outputSchema($(predictionCol)).metadata)
    addedColumns += 1
  }

  if (addedColumns == 0) {
    this.logWarning(s"$uid: GeneralizedLinearRegressionModel.transform() does nothing" +
      " because no output columns were set.")
  }
  result.toDF
}
/**
* Gets R-like summary of model on training set. An exception is
* thrown if there is no summary available.
*/@Since("2.0.0")
overridedefsummary: GeneralizedLinearRegressionTrainingSummary = super.summary
/**
* Evaluate the model on the given dataset, returning a summary of the results.
*/@Since("2.0.0")
defevaluate(dataset: Dataset[_]): GeneralizedLinearRegressionSummary = {
newGeneralizedLinearRegressionSummary(dataset, this)
}
@Since("2.0.0")
overridedefcopy(extra: ParamMap): GeneralizedLinearRegressionModel = {
val copied = copyValues(newGeneralizedLinearRegressionModel(uid, coefficients, intercept),
extra)
copied.setSummary(trainingSummary).setParent(parent)
}
/**
* Returns a [[org.apache.spark.ml.util.MLWriter]] instance for this ML instance.
*
* For [[GeneralizedLinearRegressionModel]], this does NOT currently save the
* training [[summary]]. An option to save [[summary]] may be added in the future.
*
*/@Since("2.0.0")
overridedefwrite: MLWriter =
newGeneralizedLinearRegressionModel.GeneralizedLinearRegressionModelWriter(this)
overrideval numFeatures: Int = coefficients.size
@Since("3.0.0")
overridedeftoString: String = {
s"GeneralizedLinearRegressionModel: uid=$uid, family=${$(family)}, link=${$(link)}, " +
s"numFeatures=$numFeatures"
}
}
@Since("2.0.0")
object GeneralizedLinearRegressionModel extends MLReadable[GeneralizedLinearRegressionModel] {

  /** Returns a reader that rebuilds a model from metadata plus the saved model data. */
  @Since("2.0.0")
  override def read: MLReader[GeneralizedLinearRegressionModel] = {
    new GeneralizedLinearRegressionModelReader
  }

  @Since("2.0.0")
  override def load(path: String): GeneralizedLinearRegressionModel = super.load(path)

  /** [[MLWriter]] instance for [[GeneralizedLinearRegressionModel]] */
  private[GeneralizedLinearRegressionModel]
  class GeneralizedLinearRegressionModelWriter(instance: GeneralizedLinearRegressionModel)
    extends MLWriter with Logging {

    // Row layout of the persisted model data; the field names become the Parquet column
    // names that the reader selects.
    private case class Data(intercept: Double, coefficients: Vector)

    /**
     * Writes the metadata and params under `path`, then the intercept and coefficients as a
     * single-partition Parquet dataset under `path/data`.
     */
    override protected def saveImpl(path: String): Unit = {
      // Save metadata and Params
      DefaultParamsWriter.saveMetadata(instance, path, sc)

      // Save model data: intercept, coefficients
      val modelData = Data(instance.intercept, instance.coefficients)
      val target = new Path(path, "data").toString
      sparkSession
        .createDataFrame(Seq(modelData))
        .repartition(1)
        .write
        .parquet(target)
    }
  }

  private class GeneralizedLinearRegressionModelReader
    extends MLReader[GeneralizedLinearRegressionModel] {

    /** Checked against metadata when loading model */
    private val className = classOf[GeneralizedLinearRegressionModel].getName

    /**
     * Loads the metadata, reads the first row of `path/data`, rebuilds the model from the
     * stored uid, coefficients and intercept, and restores its params from the metadata.
     */
    override def load(path: String): GeneralizedLinearRegressionModel = {
      val metadata = DefaultParamsReader.loadMetadata(path, sc, className)

      val source = new Path(path, "data").toString
      val stored = sparkSession.read
        .parquet(source)
        .select("intercept", "coefficients")
        .head()
      val storedIntercept = stored.getDouble(0)
      val storedCoefficients = stored.getAs[Vector](1)

      val restored =
        new GeneralizedLinearRegressionModel(metadata.uid, storedCoefficients, storedIntercept)
      metadata.getAndSetParams(restored)
      restored
    }
  }
}
/**
 * Summary of [[GeneralizedLinearRegression]] model and predictions.
 *
 * @param dataset Dataset to be summarized.
 * @param origModel Model to be summarized.  This is copied to create an internal
 *                  model which cannot be modified from outside.
 */
@Since("2.0.0")
class GeneralizedLinearRegressionSummary private[regression] (
    dataset: Dataset[_],
    origModel: GeneralizedLinearRegressionModel) extends Serializable {

  import GeneralizedLinearRegression._

  /**
   * Field in "predictions" which gives the predicted value of each instance.
   * This is set to a new column name if the original model's `predictionCol` is not set.
   */
  @Since("2.0.0")
  val predictionCol: String = {
    if (origModel.isDefined(origModel.predictionCol) && origModel.getPredictionCol.nonEmpty) {
      origModel.getPredictionCol
    } else {
      "prediction_" + java.util.UUID.randomUUID.toString
    }
  }

  /**
   * Private copy of model to ensure Params are not modified outside this class.
   * Coefficients is not a deep copy, but that is acceptable.
   *
   * @note [[predictionCol]] must be set correctly before the value of [[model]] is set,
   *       and [[model]] must be set before [[predictions]] is set!
   */
  protected val model: GeneralizedLinearRegressionModel =
    origModel.copy(ParamMap.empty).setPredictionCol(predictionCol)

  /**
   * Predictions output by the model's `transform` method.
   */
  @Since("2.0.0") @transient val predictions: DataFrame = model.transform(dataset)

  private[regression] lazy val familyLink: FamilyAndLink = FamilyAndLink(model)

  private[regression] lazy val family: Family = familyLink.family

  private[regression] lazy val link: Link = familyLink.link

  /** Whether the model's family name (compared in lower case) is binomial or poisson. */
  private def isBinomialOrPoisson: Boolean = {
    val familyName = model.getFamily.toLowerCase(Locale.ROOT)
    familyName == Binomial.name || familyName == Poisson.name
  }

  /**
   * Whether the intercept of the null model can be estimated analytically: there is no
   * offset, or there is an offset but the model is Gaussian family with identity link.
   */
  private def hasAnalyticNullIntercept: Boolean = {
    !model.hasOffsetCol || (family == Gaussian && link == Identity)
  }

  /**
   * summary row containing:
   * numInstances, weightSum, deviance, rss, weighted average of label - offset.
   *
   * The rss entry is NaN for the binomial and poisson families, and the weighted average is
   * NaN when the null-model intercept cannot be estimated analytically.
   */
  private lazy val glrSummary = {
    val devUDF = udf { (label: Double, pred: Double, weight: Double) =>
      family.deviance(label, pred, weight)
    }
    val devCol = sum(devUDF(label, prediction, weight))

    val rssCol = if (!isBinomialOrPoisson) {
      val rssUDF = udf { (label: Double, pred: Double, weight: Double) =>
        (label - pred) * (label - pred) * weight / family.variance(pred)
      }
      sum(rssUDF(label, prediction, weight))
    } else {
      lit(Double.NaN)
    }

    val avgCol = if (model.getFitIntercept && hasAnalyticNullIntercept) {
      sum((label - offset) * weight) / sum(weight)
    } else {
      lit(Double.NaN)
    }

    predictions
      .select(count(label), sum(weight), devCol, rssCol, avgCol)
      .head()
  }

  /** Number of instances in DataFrame predictions. */
  @Since("2.2.0")
  lazy val numInstances: Long = glrSummary.getLong(0)

  /**
   * Name of features. If the name cannot be retrieved from attributes,
   * set default names to feature column name with numbered suffix "_0", "_1", and so on.
   * The default name is also used for any individual attribute that carries no name.
   */
  private[ml] lazy val featureNames: Array[String] = {
    val featuresColName = model.getFeaturesCol
    def defaultName(i: Int): String = featuresColName + "_" + i

    AttributeGroup.fromStructField(dataset.schema(featuresColName)).attributes match {
      case Some(attrs) =>
        attrs.zipWithIndex.map { case (attr, i) => attr.name.getOrElse(defaultName(i)) }
      case None =>
        Array.tabulate[String](origModel.numFeatures)((i: Int) => defaultName(i))
    }
  }

  /** The numeric rank of the fitted linear model. */
  @Since("2.0.0")
  lazy val rank: Long = if (model.getFitIntercept) {
    model.coefficients.size + 1
  } else {
    model.coefficients.size
  }

  /** Degrees of freedom. */
  @Since("2.0.0")
  lazy val degreesOfFreedom: Long = numInstances - rank

  /** The residual degrees of freedom. */
  @Since("2.0.0")
  lazy val residualDegreeOfFreedom: Long = degreesOfFreedom

  /** The residual degrees of freedom for the null model. */
  @Since("2.0.0")
  lazy val residualDegreeOfFreedomNull: Long = {
    if (model.getFitIntercept) numInstances - 1 else numInstances
  }

  private def label: Column = col(model.getLabelCol).cast(DoubleType)

  private def prediction: Column = col(predictionCol)

  // Cast like label and offset: the summary reads this column back with `getDouble` and
  // `Double` row patterns, which fail for a weight column of another numeric type.
  private def weight: Column = {
    if (!model.hasWeightCol) lit(1.0) else col(model.getWeightCol).cast(DoubleType)
  }

  private def offset: Column = {
    if (!model.hasOffsetCol) lit(0.0) else col(model.getOffsetCol).cast(DoubleType)
  }

  /** Signed square root of the per-instance deviance; the sign follows `y - mu`. */
  private[regression] lazy val devianceResiduals: DataFrame = {
    val drUDF = udf { (y: Double, mu: Double, weight: Double) =>
      // Clamp at zero before taking the square root.
      val r = math.sqrt(math.max(family.deviance(y, mu, weight), 0.0))
      if (y > mu) r else -1.0 * r
    }
    predictions.select(
      drUDF(label, prediction, weight).as("devianceResiduals"))
  }

  /** (label - prediction) * sqrt(weight) / sqrt(variance(prediction)). */
  private[regression] lazy val pearsonResiduals: DataFrame = {
    val prUDF = udf { mu: Double => family.variance(mu) }
    predictions.select(label.minus(prediction)
      .multiply(sqrt(weight)).divide(sqrt(prUDF(prediction))).as("pearsonResiduals"))
  }

  /** (label - prediction) scaled by the derivative of the link at the prediction. */
  private[regression] lazy val workingResiduals: DataFrame = {
    val wrUDF = udf { (y: Double, mu: Double) => (y - mu) * link.deriv(mu) }
    predictions.select(wrUDF(label, prediction).as("workingResiduals"))
  }

  /** label - prediction. */
  private[regression] lazy val responseResiduals: DataFrame = {
    predictions.select(label.minus(prediction).as("responseResiduals"))
  }

  /**
   * Get the default residuals (deviance residuals) of the fitted model.
   */
  @Since("2.0.0")
  def residuals(): DataFrame = devianceResiduals

  /**
   * Get the residuals of the fitted model by type.
   *
   * @param residualsType The type of residuals which should be returned.
   *                      Supported options: deviance, pearson, working and response.
   * @throws UnsupportedOperationException if `residualsType` is not one of the supported
   *                                       options.
   */
  @Since("2.0.0")
  def residuals(residualsType: String): DataFrame = {
    residualsType match {
      case "deviance" => devianceResiduals
      case "pearson" => pearsonResiduals
      case "working" => workingResiduals
      case "response" => responseResiduals
      case other => throw new UnsupportedOperationException(
        s"The residuals type $other is not supported by Generalized Linear Regression.")
    }
  }

  /**
   * The deviance for the null model.
   */
  @Since("2.0.0")
  lazy val nullDeviance: Double = {
    /*
      Estimate intercept analytically when there is no offset, or when there is offset but
      the model is Gaussian family with identity link. Otherwise, fit an intercept only model.
     */
    val intercept: Double = if (!model.getFitIntercept) {
      0.0
    } else if (hasAnalyticNullIntercept) {
      link.link(glrSummary.getDouble(4))
    } else {
      // Create empty feature column and fit intercept only model using param setting from model
      val featureNull = "feature_" + java.util.UUID.randomUUID.toString
      val paramMap = model.extractParamMap()
      paramMap.put(model.featuresCol, featureNull)
      if (family.name != "tweedie") {
        paramMap.remove(model.variancePower)
      }
      val emptyVectorUDF = udf{ () => Vectors.zeros(0) }
      model.parent.fit(
        dataset.withColumn(featureNull, emptyVectorUDF()), paramMap
      ).intercept
    }
    predictions.select(label, offset, weight).rdd.map {
      case Row(y: Double, off: Double, w: Double) =>
        family.deviance(y, link.unlink(intercept + off), w)
    }.sum()
  }

  /**
   * The deviance for the fitted model.
   */
  @Since("2.0.0")
  lazy val deviance: Double = glrSummary.getDouble(2)

  /**
   * The dispersion of the fitted model.
   * It is taken as 1.0 for the "binomial" and "poisson" families, and otherwise
   * estimated by the residual Pearson's Chi-Squared statistic (which is defined as
   * sum of the squares of the Pearson residuals) divided by the residual degrees of freedom.
   */
  @Since("2.0.0")
  lazy val dispersion: Double = if (isBinomialOrPoisson) {
    1.0
  } else {
    val rss = glrSummary.getDouble(3)
    rss / degreesOfFreedom
  }

  /** Akaike Information Criterion (AIC) for the fitted model. */
  @Since("2.0.0")
  lazy val aic: Double = {
    val weightSum = glrSummary.getDouble(1)
    val t = predictions.select(label, prediction, weight).rdd.map {
      case Row(y: Double, pred: Double, w: Double) => (y, pred, w)
    }
    family.aic(t, deviance, numInstances, weightSum) + 2 * rank
  }
}
/**
 * Summary of [[GeneralizedLinearRegression]] fitting and model.
 *
 * @param dataset Dataset to be summarized.
 * @param origModel Model to be summarized.  This is copied to create an internal
 *                  model which cannot be modified from outside.
 * @param diagInvAtWA diagonal of matrix (A^T * W * A)^-1 in the last iteration
 * @param numIterations number of iterations
 * @param solver the solver algorithm used for model training
 */
@Since("2.0.0")
class GeneralizedLinearRegressionTrainingSummary private[regression] (
    dataset: Dataset[_],
    origModel: GeneralizedLinearRegressionModel,
    private val diagInvAtWA: Array[Double],
    @Since("2.0.0") val numIterations: Int,
    @Since("2.0.0") val solver: String)
  extends GeneralizedLinearRegressionSummary(dataset, origModel) with Serializable {

  import GeneralizedLinearRegression._

  /**
   * Whether the underlying `WeightedLeastSquares` using the "normal" solver.
   * A single-element `diagInvAtWA` holding 0 is treated as "not the normal solver".
   */
  private[ml] val isNormalSolver: Boolean = {
    diagInvAtWA.length != 1 || diagInvAtWA(0) != 0
  }

  /**
   * Standard error of estimated coefficients and intercept.
   * This value is only available when the underlying `WeightedLeastSquares`
   * using the "normal" solver.
   *
   * If `GeneralizedLinearRegression.fitIntercept` is set to true,
   * then the last element returned corresponds to the intercept.
   */
  @Since("2.0.0")
  lazy val coefficientStandardErrors: Array[Double] = {
    if (isNormalSolver) {
      diagInvAtWA.map(_ * dispersion).map(math.sqrt)
    } else {
      throw new UnsupportedOperationException(
        "No Std. Error of coefficients available for this GeneralizedLinearRegressionModel")
    }
  }

  /**
   * T-statistic of estimated coefficients and intercept.
   * This value is only available when the underlying `WeightedLeastSquares`
   * using the "normal" solver.
   *
   * If `GeneralizedLinearRegression.fitIntercept` is set to true,
   * then the last element returned corresponds to the intercept.
   */
  @Since("2.0.0")
  lazy val tValues: Array[Double] = {
    if (isNormalSolver) {
      val estimate = if (model.getFitIntercept) {
        Array.concat(model.coefficients.toArray, Array(model.intercept))
      } else {
        model.coefficients.toArray
      }
      estimate.zip(coefficientStandardErrors).map { case (value, stdErr) => value / stdErr }
    } else {
      throw new UnsupportedOperationException(
        "No t-statistic available for this GeneralizedLinearRegressionModel")
    }
  }

  /**
   * Two-sided p-value of estimated coefficients and intercept.
   * This value is only available when the underlying `WeightedLeastSquares`
   * using the "normal" solver.
   *
   * The binomial and poisson families use the standard normal distribution; every other
   * family uses Student's t with `degreesOfFreedom` degrees of freedom.
   *
   * If `GeneralizedLinearRegression.fitIntercept` is set to true,
   * then the last element returned corresponds to the intercept.
   */
  @Since("2.0.0")
  lazy val pValues: Array[Double] = {
    if (isNormalSolver) {
      val familyName = model.getFamily.toLowerCase(Locale.ROOT)
      if (familyName == Binomial.name || familyName == Poisson.name) {
        tValues.map { x => 2.0 * (1.0 - dist.Gaussian(0.0, 1.0).cdf(math.abs(x))) }
      } else {
        tValues.map { x =>
          2.0 * (1.0 - dist.StudentsT(degreesOfFreedom.toDouble).cdf(math.abs(x)))
        }
      }
    } else {
      throw new UnsupportedOperationException(
        "No p-value available for this GeneralizedLinearRegressionModel")
    }
  }

  /**
   * Coefficients with statistics: feature name, coefficients, standard error, tValue and pValue.
   * When the model fits an intercept, its row is labelled "(Intercept)" and comes first.
   */
  private[ml] lazy val coefficientsWithStatistics: Array[
    (String, Double, Double, Double, Double)] = {
    val coefficients = model.coefficients.toArray
    val featureOrder = Array.range(0, coefficients.length)

    val (names, estimates, order) = if (model.getFitIntercept) {
      // The intercept is stored last in every statistics array; list it first in the output.
      (featureNames :+ "(Intercept)",
        coefficients :+ model.intercept,
        coefficients.length +: featureOrder)
    } else {
      (featureNames, coefficients, featureOrder)
    }

    order.map { i =>
      (names(i), estimates(i), coefficientStandardErrors(i), tValues(i), pValues(i))
    }
  }

  /**
   * Renders a text summary: the coefficient table, the dispersion, the null and residual
   * deviances with their degrees of freedom, and (except for the tweedie family) the AIC.
   *
   * @throws UnsupportedOperationException if the "normal" solver was not used.
   */
  override def toString: String = {
    if (isNormalSolver) {

      // BigDecimal cannot be built from NaN or an infinity, so print those values as-is
      // instead of failing the whole summary.
      def round(x: Double): String = {
        if (x.isNaN || x.isInfinity) {
          x.toString
        } else {
          BigDecimal(x).setScale(4, BigDecimal.RoundingMode.HALF_UP).toString
        }
      }

      val colNames = Array("Feature", "Estimate", "Std Error", "T Value", "P Value")

      val data = coefficientsWithStatistics.map { row =>
        val strRow = row.productIterator.map { cell =>
          val str = cell match {
            case s: String => s
            case n: Double => round(n)
          }
          // Truncate if length > 20
          if (str.length > 20) {
            str.substring(0, 17) + "..."
          } else {
            str
          }
        }
        strRow.toArray
      }

      // Compute the width of each column
      val colWidths = colNames.map(_.length)
      data.foreach { strRow =>
        strRow.zipWithIndex.foreach { case (cell: String, i: Int) =>
          colWidths(i) = math.max(colWidths(i), cell.length)
        }
      }

      val sb = new StringBuilder

      // Output coefficients with statistics
      sb.append("Coefficients:\n")
      colNames.zipWithIndex.map { case (colName: String, i: Int) =>
        StringUtils.leftPad(colName, colWidths(i))
      }.addString(sb, "", " ", "\n")

      data.foreach { strRow =>
        strRow.zipWithIndex.map { case (cell: String, i: Int) =>
          StringUtils.leftPad(cell, colWidths(i))
        }.addString(sb, "", " ", "\n")
      }

      sb.append("\n")
      sb.append(s"(Dispersion parameter for ${family.name} family taken to be " +
        s"${round(dispersion)})")

      sb.append("\n")
      // The null deviance belongs to the null model, so report that model's degrees of
      // freedom rather than the residual degrees of freedom of the fitted model.
      val nd = s"Null deviance: ${round(nullDeviance)} on $residualDegreeOfFreedomNull " +
        "degrees of freedom"
      val rd = s"Residual deviance: ${round(deviance)} on $residualDegreeOfFreedom degrees of " +
        "freedom"
      val l = math.max(nd.length, rd.length)
      sb.append(StringUtils.leftPad(nd, l))
      sb.append("\n")
      sb.append(StringUtils.leftPad(rd, l))

      if (family.name != "tweedie") {
        sb.append("\n")
        sb.append(s"AIC: ${round(aic)}")
      }

      sb.toString()
    } else {
      throw new UnsupportedOperationException(
        "No summary available for this GeneralizedLinearRegressionModel")
    }
  }
}