Many resources are needed to download a project. Please understand that we have to compensate our server costs. Thank you in advance. Project price only 1 $
You can buy this project and download/modify it how often you want.
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.spark.ml.clustering
import scala.collection.mutable
import org.apache.hadoop.fs.Path
import org.apache.spark.annotation.Since
import org.apache.spark.ml.{Estimator, Model, PipelineStage}
import org.apache.spark.ml.functions.checkNonNegativeWeight
import org.apache.spark.ml.linalg.Vector
import org.apache.spark.ml.param._
import org.apache.spark.ml.param.shared._
import org.apache.spark.ml.util._
import org.apache.spark.ml.util.Instrumentation.instrumented
import org.apache.spark.mllib.clustering.{DistanceMeasure, KMeans => MLlibKMeans, KMeansModel => MLlibKMeansModel}
import org.apache.spark.mllib.linalg.{Vector => OldVector, Vectors => OldVectors}
import org.apache.spark.mllib.linalg.VectorImplicits._
import org.apache.spark.sql.{DataFrame, Dataset, Row, SparkSession}
import org.apache.spark.sql.functions._
import org.apache.spark.sql.types.{DoubleType, IntegerType, StructType}
import org.apache.spark.storage.StorageLevel
import org.apache.spark.util.VersionUtils.majorVersion
/**
* Common params for KMeans and KMeansModel
*/
private[clustering] trait KMeansParams extends Params with HasMaxIter with HasFeaturesCol
with HasSeed with HasPredictionCol with HasTol with HasDistanceMeasure with HasWeightCol {
/**
* The number of clusters to create (k). Must be > 1. Note that it is possible for fewer than
* k clusters to be returned, for example, if there are fewer than k distinct points to cluster.
* Default: 2.
* @group param
*/
@Since("1.5.0")
final val k = new IntParam(this, "k", "The number of clusters to create. " +
"Must be > 1.", ParamValidators.gt(1))
/** @group getParam */
@Since("1.5.0")
def getK: Int = $(k)
/**
* Param for the initialization algorithm. This can be either "random" to choose random points as
* initial cluster centers, or "k-means||" to use a parallel variant of k-means++
* (Bahmani et al., Scalable K-Means++, VLDB 2012). Default: k-means||.
* @group expertParam
*/
@Since("1.5.0")
final val initMode = new Param[String](this, "initMode", "The initialization algorithm. " +
"Supported options: 'random' and 'k-means||'.",
(value: String) => MLlibKMeans.validateInitMode(value))
/** @group expertGetParam */
@Since("1.5.0")
def getInitMode: String = $(initMode)
/**
* Param for the number of steps for the k-means|| initialization mode. This is an advanced
* setting -- the default of 2 is almost always enough. Must be > 0. Default: 2.
* @group expertParam
*/
@Since("1.5.0")
final val initSteps = new IntParam(this, "initSteps", "The number of steps for k-means|| " +
"initialization mode. Must be > 0.", ParamValidators.gt(0))
/** @group expertGetParam */
@Since("1.5.0")
def getInitSteps: Int = $(initSteps)
setDefault(k -> 2, maxIter -> 20, initMode -> MLlibKMeans.K_MEANS_PARALLEL, initSteps -> 2,
tol -> 1e-4, distanceMeasure -> DistanceMeasure.EUCLIDEAN)
/**
* Validates and transforms the input schema.
* @param schema input schema
* @return output schema
*/
protected def validateAndTransformSchema(schema: StructType): StructType = {
SchemaUtils.validateVectorCompatibleColumn(schema, getFeaturesCol)
SchemaUtils.appendColumn(schema, $(predictionCol), IntegerType)
}
}
/**
* Model fitted by KMeans.
*
* @param parentModel a model trained by spark.mllib.clustering.KMeans.
*/
@Since("1.5.0")
class KMeansModel private[ml] (
@Since("1.5.0") override val uid: String,
private[clustering] val parentModel: MLlibKMeansModel)
extends Model[KMeansModel] with KMeansParams with GeneralMLWritable
with HasTrainingSummary[KMeansSummary] {
@Since("3.0.0")
lazy val numFeatures: Int = parentModel.clusterCenters.head.size
@Since("1.5.0")
override def copy(extra: ParamMap): KMeansModel = {
val copied = copyValues(new KMeansModel(uid, parentModel), extra)
copied.setSummary(trainingSummary).setParent(this.parent)
}
/** @group setParam */
@Since("2.0.0")
def setFeaturesCol(value: String): this.type = set(featuresCol, value)
/** @group setParam */
@Since("2.0.0")
def setPredictionCol(value: String): this.type = set(predictionCol, value)
@Since("2.0.0")
override def transform(dataset: Dataset[_]): DataFrame = {
val outputSchema = transformSchema(dataset.schema, logging = true)
val predictUDF = udf((vector: Vector) => predict(vector))
dataset.withColumn($(predictionCol),
predictUDF(DatasetUtils.columnToVector(dataset, getFeaturesCol)),
outputSchema($(predictionCol)).metadata)
}
@Since("1.5.0")
override def transformSchema(schema: StructType): StructType = {
var outputSchema = validateAndTransformSchema(schema)
if ($(predictionCol).nonEmpty) {
outputSchema = SchemaUtils.updateNumValues(outputSchema,
$(predictionCol), parentModel.k)
}
outputSchema
}
@Since("3.0.0")
def predict(features: Vector): Int = parentModel.predict(features)
@Since("2.0.0")
def clusterCenters: Array[Vector] = parentModel.clusterCenters.map(_.asML)
/**
* Returns a [[org.apache.spark.ml.util.GeneralMLWriter]] instance for this ML instance.
*
* For [[KMeansModel]], this does NOT currently save the training [[summary]].
* An option to save [[summary]] may be added in the future.
*
*/
@Since("1.6.0")
override def write: GeneralMLWriter = new GeneralMLWriter(this)
@Since("3.0.0")
override def toString: String = {
s"KMeansModel: uid=$uid, k=${parentModel.k}, distanceMeasure=${$(distanceMeasure)}, " +
s"numFeatures=$numFeatures"
}
/**
* Gets summary of model on training set. An exception is
* thrown if `hasSummary` is false.
*/
@Since("2.0.0")
override def summary: KMeansSummary = super.summary
}
/** Helper class for storing model data */
private case class ClusterData(clusterIdx: Int, clusterCenter: Vector)
/** A writer for KMeans that handles the "internal" (or default) format */
private class InternalKMeansModelWriter extends MLWriterFormat with MLFormatRegister {
override def format(): String = "internal"
override def stageName(): String = "org.apache.spark.ml.clustering.KMeansModel"
override def write(path: String, sparkSession: SparkSession,
optionMap: mutable.Map[String, String], stage: PipelineStage): Unit = {
val instance = stage.asInstanceOf[KMeansModel]
val sc = sparkSession.sparkContext
// Save metadata and Params
DefaultParamsWriter.saveMetadata(instance, path, sc)
// Save model data: cluster centers
val data: Array[ClusterData] = instance.clusterCenters.zipWithIndex.map {
case (center, idx) =>
ClusterData(idx, center)
}
val dataPath = new Path(path, "data").toString
sparkSession.createDataFrame(data).repartition(1).write.parquet(dataPath)
}
}
/** A writer for KMeans that handles the "pmml" format */
private class PMMLKMeansModelWriter extends MLWriterFormat with MLFormatRegister {
override def format(): String = "pmml"
override def stageName(): String = "org.apache.spark.ml.clustering.KMeansModel"
override def write(path: String, sparkSession: SparkSession,
optionMap: mutable.Map[String, String], stage: PipelineStage): Unit = {
val instance = stage.asInstanceOf[KMeansModel]
val sc = sparkSession.sparkContext
instance.parentModel.toPMML(sc, path)
}
}
@Since("1.6.0")
object KMeansModel extends MLReadable[KMeansModel] {
@Since("1.6.0")
override def read: MLReader[KMeansModel] = new KMeansModelReader
@Since("1.6.0")
override def load(path: String): KMeansModel = super.load(path)
/**
* We store all cluster centers in a single row and use this class to store model data by
* Spark 1.6 and earlier. A model can be loaded from such older data for backward compatibility.
*/
private case class OldData(clusterCenters: Array[OldVector])
private class KMeansModelReader extends MLReader[KMeansModel] {
/** Checked against metadata when loading model */
private val className = classOf[KMeansModel].getName
override def load(path: String): KMeansModel = {
// Import implicits for Dataset Encoder
val sparkSession = super.sparkSession
import sparkSession.implicits._
val metadata = DefaultParamsReader.loadMetadata(path, sc, className)
val dataPath = new Path(path, "data").toString
val clusterCenters = if (majorVersion(metadata.sparkVersion) >= 2) {
val data: Dataset[ClusterData] = sparkSession.read.parquet(dataPath).as[ClusterData]
data.collect().sortBy(_.clusterIdx).map(_.clusterCenter).map(OldVectors.fromML)
} else {
// Loads KMeansModel stored with the old format used by Spark 1.6 and earlier.
sparkSession.read.parquet(dataPath).as[OldData].head().clusterCenters
}
val model = new KMeansModel(metadata.uid, new MLlibKMeansModel(clusterCenters))
metadata.getAndSetParams(model)
model
}
}
}
/**
* K-means clustering with support for k-means|| initialization proposed by Bahmani et al.
*
* @see Bahmani et al., Scalable k-means++.
*/
@Since("1.5.0")
class KMeans @Since("1.5.0") (
@Since("1.5.0") override val uid: String)
extends Estimator[KMeansModel] with KMeansParams with DefaultParamsWritable {
@Since("1.5.0")
override def copy(extra: ParamMap): KMeans = defaultCopy(extra)
@Since("1.5.0")
def this() = this(Identifiable.randomUID("kmeans"))
/** @group setParam */
@Since("1.5.0")
def setFeaturesCol(value: String): this.type = set(featuresCol, value)
/** @group setParam */
@Since("1.5.0")
def setPredictionCol(value: String): this.type = set(predictionCol, value)
/** @group setParam */
@Since("1.5.0")
def setK(value: Int): this.type = set(k, value)
/** @group expertSetParam */
@Since("1.5.0")
def setInitMode(value: String): this.type = set(initMode, value)
/** @group expertSetParam */
@Since("2.4.0")
def setDistanceMeasure(value: String): this.type = set(distanceMeasure, value)
/** @group expertSetParam */
@Since("1.5.0")
def setInitSteps(value: Int): this.type = set(initSteps, value)
/** @group setParam */
@Since("1.5.0")
def setMaxIter(value: Int): this.type = set(maxIter, value)
/** @group setParam */
@Since("1.5.0")
def setTol(value: Double): this.type = set(tol, value)
/** @group setParam */
@Since("1.5.0")
def setSeed(value: Long): this.type = set(seed, value)
/**
* Sets the value of param [[weightCol]].
* If this is not set or empty, we treat all instance weights as 1.0.
* Default is not set, so all instances have weight one.
*
* @group setParam
*/
@Since("3.0.0")
def setWeightCol(value: String): this.type = set(weightCol, value)
@Since("2.0.0")
override def fit(dataset: Dataset[_]): KMeansModel = instrumented { instr =>
transformSchema(dataset.schema, logging = true)
instr.logPipelineStage(this)
instr.logDataset(dataset)
instr.logParams(this, featuresCol, predictionCol, k, initMode, initSteps, distanceMeasure,
maxIter, seed, tol, weightCol)
val algo = new MLlibKMeans()
.setK($(k))
.setInitializationMode($(initMode))
.setInitializationSteps($(initSteps))
.setMaxIterations($(maxIter))
.setSeed($(seed))
.setEpsilon($(tol))
.setDistanceMeasure($(distanceMeasure))
val w = if (isDefined(weightCol) && $(weightCol).nonEmpty) {
checkNonNegativeWeight(col($(weightCol)).cast(DoubleType))
} else {
lit(1.0)
}
val instances = dataset.select(DatasetUtils.columnToVector(dataset, getFeaturesCol), w)
.rdd.map { case Row(point: Vector, weight: Double) => (OldVectors.fromML(point), weight) }
val handlePersistence = dataset.storageLevel == StorageLevel.NONE
val parentModel = algo.runWithWeight(instances, handlePersistence, Some(instr))
val model = copyValues(new KMeansModel(uid, parentModel).setParent(this))
val summary = new KMeansSummary(
model.transform(dataset),
$(predictionCol),
$(featuresCol),
$(k),
parentModel.numIter,
parentModel.trainingCost)
model.setSummary(Some(summary))
instr.logNamedValue("clusterSizes", summary.clusterSizes)
model
}
@Since("1.5.0")
override def transformSchema(schema: StructType): StructType = {
validateAndTransformSchema(schema)
}
}
@Since("1.6.0")
object KMeans extends DefaultParamsReadable[KMeans] {
@Since("1.6.0")
override def load(path: String): KMeans = super.load(path)
}
/**
* Summary of KMeans.
*
* @param predictions `DataFrame` produced by `KMeansModel.transform()`.
* @param predictionCol Name for column of predicted clusters in `predictions`.
* @param featuresCol Name for column of features in `predictions`.
* @param k Number of clusters.
* @param numIter Number of iterations.
* @param trainingCost K-means cost (sum of squared distances to the nearest centroid for all
* points in the training dataset). This is equivalent to sklearn's inertia.
*/
@Since("2.0.0")
class KMeansSummary private[clustering] (
predictions: DataFrame,
predictionCol: String,
featuresCol: String,
k: Int,
numIter: Int,
@Since("2.4.0") val trainingCost: Double)
extends ClusteringSummary(predictions, predictionCol, featuresCol, k, numIter)