// com.tencent.angel.sona.ml.evaluation.ClusteringEvaluator.scala
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package com.tencent.angel.sona.ml.evaluation

import org.apache.spark.SparkContext
import com.tencent.angel.sona.ml.attribute.AttributeGroup
import org.apache.spark.linalg
import org.apache.spark.linalg.{BLAS, DenseVector, IntSparseVector, LongSparseVector, Vectors}
import com.tencent.angel.sona.ml.param.{Param, ParamMap, ParamValidators}
import com.tencent.angel.sona.ml.param.shared.{HasFeaturesCol, HasPredictionCol}
import org.apache.spark.broadcast.Broadcast
import com.tencent.angel.sona.ml.util._
import org.apache.spark.sql.{Column, DataFrame, Dataset}
import org.apache.spark.sql.functions.{avg, col, udf}
import org.apache.spark.sql.types.DoubleType
import org.apache.spark.sql.util.SONASchemaUtils
import org.apache.spark.util.DatasetUtil


/**
 * :: Experimental ::
 *
 * Evaluator for clustering results.
 * The metric computes the Silhouette measure using the specified distance measure.
 *
 * The Silhouette is a measure for the validation of the consistency within clusters. It ranges
 * between 1 and -1, where a value close to 1 means that the points in a cluster are close to the
 * other points in the same cluster and far from the points of the other clusters.
 */

class ClusteringEvaluator(override val uid: String)
  extends Evaluator with HasPredictionCol with HasFeaturesCol with DefaultParamsWritable {

  /** Builds an evaluator whose uid is generated from the `"cluEval"` prefix. */
  def this() = this(Identifiable.randomUID("cluEval"))

  /** Returns the result of `defaultCopy` applied to the supplied extra params. */
  override def copy(pMap: ParamMap): ClusteringEvaluator = defaultCopy(pMap)

  /** The metric produced by `evaluate` is reported as "larger is better". */
  override def isLargerBetter: Boolean = true

  /** @group setParam */
  def setPredictionCol(value: String): this.type = set(predictionCol, value)

  /** @group setParam */
  def setFeaturesCol(value: String): this.type = set(featuresCol, value)

  /**
   * Name of the metric computed by `evaluate`.
   * The only accepted value is `"silhouette"`, which is also the default.
   *
   * @group param
   */
  val metricName: Param[String] = {
    val supported = Array("silhouette")
    val validator = ParamValidators.inArray(supported)
    new Param(this, "metricName", "metric name in evaluation (silhouette)", validator)
  }

  /** @group getParam */
  def getMetricName: String = $(metricName)

  /** @group setParam */
  def setMetricName(value: String): this.type = set(metricName, value)

  /**
   * Distance measure used when computing the metric.
   * Accepted values are `"squaredEuclidean"` (default) and `"cosine"`.
   *
   * @group param
   */
  val distanceMeasure: Param[String] = {
    val supported = Array("squaredEuclidean", "cosine")
    val validator = ParamValidators.inArray(supported)
    // The list of options is rendered as 'a', 'b' and appended to the fixed description.
    val quotedOptions = supported.mkString("'", "', '", "'")
    new Param(
      this,
      "distanceMeasure",
      "distance measure in evaluation. Supported options: " + quotedOptions,
      validator)
  }

  /** @group getParam */
  def getDistanceMeasure: String = $(distanceMeasure)

  /** @group setParam */
  def setDistanceMeasure(value: String): this.type = set(distanceMeasure, value)

  // Must run after both params above have been initialised.
  setDefault(metricName -> "silhouette", distanceMeasure -> "squaredEuclidean")

  /**
   * Computes the configured metric on a clustered dataset.
   *
   * The features column is validated as vector-compatible and the prediction column as
   * numeric; the dataset is then projected to those two columns (features converted through
   * `DatasetUtil.columnToVector`, keeping the original column metadata) before the score is
   * delegated to the Silhouette implementation matching the distance measure.
   *
   * @param dataset dataset holding the prediction and features columns
   * @return the Silhouette score computed by the selected implementation
   * @throws IllegalArgumentException if the (metric, distance) pair has no implementation
   */
  override def evaluate(dataset: Dataset[_]): Double = {
    val schema = dataset.schema
    val featuresName = $(featuresCol)
    val predictionName = $(predictionCol)

    SONASchemaUtils.validateVectorCompatibleColumn(schema, featuresName)
    SONASchemaUtils.checkNumericType(schema, predictionName)

    val featuresAsVector = DatasetUtil.columnToVector(dataset, featuresName)
      .as(featuresName, schema(featuresName).metadata)
    val projected = dataset.select(col(predictionName), featuresAsVector)

    val metric = $(metricName)
    val distance = $(distanceMeasure)
    if (metric == "silhouette" && distance == "squaredEuclidean") {
      SquaredEuclideanSilhouette.computeSilhouetteScore(projected, predictionName, featuresName)
    } else if (metric == "silhouette" && distance == "cosine") {
      CosineSilhouette.computeSilhouetteScore(projected, predictionName, featuresName)
    } else {
      throw new IllegalArgumentException(s"No support for metric $metric, distance $distance")
    }
  }
}



/**
 * Companion object of [[ClusteringEvaluator]]; it mixes in `DefaultParamsReadable` so that
 * evaluators can be read back through `load`.
 */
object ClusteringEvaluator extends DefaultParamsReadable[ClusteringEvaluator] {

  /**
   * Reads a [[ClusteringEvaluator]] from `path` by delegating to the inherited `load`.
   *
   * @param path location handed unchanged to the inherited loader
   * @return the evaluator returned by the inherited loader
   */
  override def load(path: String): ClusteringEvaluator = {
    super.load(path)
  }
}


/**
 * Shared building blocks for the Silhouette implementations: the per-point coefficient,
 * the dataset-level average and a helper that resolves the dimension of a vector column.
 */
private[evaluation] abstract class Silhouette {

  /**
   * It computes the Silhouette coefficient for a point.
   *
   * A point that is the only member of its cluster gets a coefficient of `0.0`, following the
   * standard definition of the Silhouette: the intra-cluster dissimilarity is undefined for a
   * singleton, and scoring it as a perfect `1.0` would inflate the overall average.
   *
   * @param clusterIds               The ids of all the clusters.
   * @param pointClusterId           The id of the cluster the point belongs to.
   * @param pointClusterNumOfPoints  The number of points in the cluster of the point
   *                                 (the point itself included).
   * @param averageDistanceToCluster A function returning the average distance of the point to
   *                                 all the points of the cluster with the given id.
   * @return The Silhouette coefficient of the point.
   */
  def pointSilhouetteCoefficient(
      clusterIds: Set[Double],
      pointClusterId: Double,
      pointClusterNumOfPoints: Long,
      averageDistanceToCluster: (Double) => Double): Double = {
    if (pointClusterNumOfPoints == 1) {
      // Single-element clusters have Silhouette 0 by definition.
      0.0
    } else {
      // Here we compute the average dissimilarity of the current point to any cluster of which
      // the point is not a member.
      // The cluster with the lowest average dissimilarity - i.e. the nearest cluster to the
      // current point - is said to be the "neighboring cluster".
      val otherClusterIds = clusterIds.filter(_ != pointClusterId)
      val neighboringClusterDissimilarity = otherClusterIds.map(averageDistanceToCluster).min

      // The average returned for the point's own cluster includes the point itself (at
      // distance 0); rescale by N / (N - 1) to exclude it from the average.
      val currentClusterDissimilarity =
        averageDistanceToCluster(pointClusterId) * pointClusterNumOfPoints /
          (pointClusterNumOfPoints - 1)

      if (currentClusterDissimilarity < neighboringClusterDissimilarity) {
        1 - (currentClusterDissimilarity / neighboringClusterDissimilarity)
      } else if (currentClusterDissimilarity > neighboringClusterDissimilarity) {
        (neighboringClusterDissimilarity / currentClusterDissimilarity) - 1
      } else {
        0.0
      }
    }
  }

  /**
   * Compute the mean Silhouette values of all samples.
   *
   * @param df          The DataFrame the score column is evaluated on.
   * @param scoreColumn The column expression yielding the Silhouette of each row.
   * @return The average of `scoreColumn` over `df`.
   */
  def overallScore(df: DataFrame, scoreColumn: Column): Double = {
    df.select(avg(scoreColumn)).collect()(0).getDouble(0)
  }

  /**
   * Resolves the number of features of a vector column.
   *
   * The size stored in the column's attribute group is used when it is non-negative;
   * otherwise the size of the vector in the first row of the DataFrame is used.
   *
   * @param dataFrame  The DataFrame containing the column.
   * @param columnName The name of the vector column.
   * @return The number of features of the column.
   */
  protected def getNumberOfFeatures(dataFrame: DataFrame, columnName: String): Int = {
    val group = AttributeGroup.fromStructField(dataFrame.schema(columnName))
    if (group.size < 0) {
      dataFrame.select(col(columnName)).first().getAs[linalg.Vector](0).size.toInt
    } else {
      group.size.toInt
    }
  }
}

/**
 * SquaredEuclideanSilhouette computes the average of the
 * Silhouette over all the data of the dataset, which is
 * a measure of how appropriately the data have been clustered.
 *
 * The Silhouette for each point `i` is defined as:
 *
 * 
 *   $$
 *   s_{i} = \frac{b_{i}-a_{i}}{max\{a_{i},b_{i}\}}
 *   $$
 * 
 *
 * which can be rewritten as
 *
 * 
 *   $$
 *   s_{i}= \begin{cases}
 *   1-\frac{a_{i}}{b_{i}} & \text{if } a_{i} \leq b_{i} \\
 *   \frac{b_{i}}{a_{i}}-1 & \text{if } a_{i} \gt b_{i} \end{cases}
 *   $$
 * 
 *
 * where `$a_{i}$` is the average dissimilarity of `i` with all other data
 * within the same cluster, `$b_{i}$` is the lowest average dissimilarity
 * of `i` to any other cluster, of which `i` is not a member.
 * `$a_{i}$` can be interpreted as how well `i` is assigned to its cluster
 * (the smaller the value, the better the assignment), while `$b_{i}$` is
 * a measure of how well `i` has not been assigned to its "neighboring cluster",
 * ie. the nearest cluster to `i`.
 *
 * Unfortunately, the naive implementation of the algorithm requires computing
 * the distance between each pair of points in the dataset. Since the computation of
 * the distance measure takes `D` operations - if `D` is the number of dimensions
 * of each point - the computational complexity of the algorithm is `O(N^2^*D)`, where
 * `N` is the cardinality of the dataset. Of course this is not scalable in `N`,
 * which is the critical number in a Big Data context.
 *
 * The algorithm which is implemented in this object, instead, is an efficient
 * and parallel implementation of the Silhouette using the squared Euclidean
 * distance measure.
 *
 * With this assumption, the total distance of the point `X`
 * to the points `$C_{i}$` belonging to the cluster `$\Gamma$` is:
 *
 * 
 *   $$
 *   \sum\limits_{i=1}^N d(X, C_{i} ) =
 *   \sum\limits_{i=1}^N \Big( \sum\limits_{j=1}^D (x_{j}-c_{ij})^2 \Big)
 *   = \sum\limits_{i=1}^N \Big( \sum\limits_{j=1}^D x_{j}^2 +
 *   \sum\limits_{j=1}^D c_{ij}^2 -2\sum\limits_{j=1}^D x_{j}c_{ij} \Big)
 *   = \sum\limits_{i=1}^N \sum\limits_{j=1}^D x_{j}^2 +
 *   \sum\limits_{i=1}^N \sum\limits_{j=1}^D c_{ij}^2
 *   -2 \sum\limits_{i=1}^N \sum\limits_{j=1}^D x_{j}c_{ij}
 *   $$
 * 
 *
 * where `$x_{j}$` is the `j`-th dimension of the point `X` and
 * `$c_{ij}$` is the `j`-th dimension of the `i`-th point in cluster `$\Gamma$`.
 *
 * Then, the first term of the equation can be rewritten as:
 *
 * 
 *   $$
 *   \sum\limits_{i=1}^N \sum\limits_{j=1}^D x_{j}^2 = N \xi_{X} \text{ ,
 *   with } \xi_{X} = \sum\limits_{j=1}^D x_{j}^2
 *   $$
 * 
 *
 * where `$\xi_{X}$` is fixed for each point and it can be precomputed.
 *
 * Moreover, the second term is fixed for each cluster too,
 * thus we can name it `$\Psi_{\Gamma}$`
 *
 * 
 *   $$
 *   \sum\limits_{i=1}^N \sum\limits_{j=1}^D c_{ij}^2 =
 *   \sum\limits_{i=1}^N \xi_{C_{i}} = \Psi_{\Gamma}
 *   $$
 * 
 *
 * Last, the third element becomes
 *
 * 
 *   $$
 *   \sum\limits_{i=1}^N \sum\limits_{j=1}^D x_{j}c_{ij} =
 *   \sum\limits_{j=1}^D \Big(\sum\limits_{i=1}^N c_{ij} \Big) x_{j}
 *   $$
 * 
 *
 * thus defining the vector
 *
 * 
 *   $$
 *   Y_{\Gamma}:Y_{\Gamma j} = \sum\limits_{i=1}^N c_{ij} , j=1, ..., D
 *   $$
 * 
 *
 * which is fixed for each cluster `$\Gamma$`, we have
 *
 * 
 *   $$
 *   \sum\limits_{j=1}^D \Big(\sum\limits_{i=1}^N c_{ij} \Big) x_{j} =
 *   \sum\limits_{j=1}^D Y_{\Gamma j} x_{j}
 *   $$
 * 
 *
 * In this way, the previous equation becomes
 *
 * 
 *   $$
 *   N\xi_{X} + \Psi_{\Gamma} - 2 \sum\limits_{j=1}^D Y_{\Gamma j} x_{j}
 *   $$
 * 
 *
 * and the average distance of a point to a cluster can be computed as
 *
 * 
 *   $$
 *   \frac{\sum\limits_{i=1}^N d(X, C_{i} )}{N} =
 *   \frac{N\xi_{X} + \Psi_{\Gamma} - 2 \sum\limits_{j=1}^D Y_{\Gamma j} x_{j}}{N} =
 *   \xi_{X} + \frac{\Psi_{\Gamma} }{N} - 2 \frac{\sum\limits_{j=1}^D Y_{\Gamma j} x_{j}}{N}
 *   $$
 * 
 *
 * Thus, it is enough to precompute: the constant `$\xi_{X}$` for each point `X`; the
 * constants `$\Psi_{\Gamma}$`, `N` and the vector `$Y_{\Gamma}$` for
 * each cluster `$\Gamma$`.
 *
 * In the implementation, the precomputed values for the clusters
 * are distributed among the worker nodes via broadcasted variables,
 * because we can assume that the clusters are limited in number and
 * anyway they are much fewer than the points.
 *
 * The main strengths of this algorithm are the low computational complexity
 * and the intrinsic parallelism. The precomputed information for each point
 * and for each cluster can be computed with a computational complexity
 * which is `O(N/W)`, where `N` is the number of points in the dataset and
 * `W` is the number of worker nodes. After that, every point can be
 * analyzed independently of the others.
 *
 * For every point we need to compute the average distance to all the clusters.
 * Since the formula above requires `O(D)` operations, this phase has a
 * computational complexity which is `O(C*D*N/W)` where `C` is the number of
 * clusters (which we assume quite low), `D` is the number of dimensions,
 * `N` is the number of points in the dataset and `W` is the number
 * of worker nodes.
 */
private[evaluation] object SquaredEuclideanSilhouette extends Silhouette {

  // Remembers that the registration below has already been attempted by this object.
  private[this] var kryoRegistrationPerformed: Boolean = false

  /**
   * This method registers the class
   * [[SquaredEuclideanSilhouette.ClusterStats]]
   * for kryo serialization.
   *
   * NOTE(review): `SparkContext.getConf` is documented as returning a copy of the
   * configuration, so this registration presumably does not affect the running context -
   * confirm before relying on it.
   *
   * @param sc `SparkContext` to be used
   */
  def registerKryoClasses(sc: SparkContext): Unit = {
    if (!kryoRegistrationPerformed) {
      sc.getConf.registerKryoClasses(
        Array(
          classOf[SquaredEuclideanSilhouette.ClusterStats]
        )
      )
      kryoRegistrationPerformed = true
    }
  }

  /**
   * Precomputed values of a cluster.
   *
   * @param featureSum     The element-wise sum of the feature vectors of the cluster's points
   *                       (`$Y_{\Gamma}$`).
   * @param squaredNormSum The sum of the squared norms of the cluster's points
   *                       (`$\Psi_{\Gamma}$`).
   * @param numOfPoints    The number of points in the cluster (`N`).
   */
  case class ClusterStats(featureSum: linalg.Vector, squaredNormSum: Double, numOfPoints: Long)

  /**
   * The method takes the input dataset and computes the aggregated values
   * about a cluster which are needed by the algorithm.
   *
   * @param df The DataFrame which contains the input data; besides the two columns named
   *           below it must contain a `squaredNorm` column with the squared norm of each point.
   * @param predictionCol The name of the column which contains the predicted cluster id
   *                      for the point.
   * @param featuresCol The name of the column which contains the feature vector of the point.
   * @return A [[scala.collection.immutable.Map]] which associates each cluster id
   *         to a [[ClusterStats]] object (which contains the precomputed values `N`,
   *         `$\Psi_{\Gamma}$` and `$Y_{\Gamma}$` for a cluster).
   */
  def computeClusterStats(
      df: DataFrame,
      predictionCol: String,
      featuresCol: String): Map[Double, ClusterStats] = {
    val numFeatures = getNumberOfFeatures(df, featuresCol)
    val clustersStatsRDD = df.select(
        col(predictionCol).cast(DoubleType), col(featuresCol), col("squaredNorm"))
      .rdd
      .map { row => (row.getDouble(0), (row.getAs[linalg.Vector](1), row.getDouble(2))) }
      .aggregateByKey[(DenseVector, Double, Long)]((Vectors.zeros(numFeatures).toDense, 0.0, 0L))(
        // Both operators accumulate in place into the left-hand feature sum.
        seqOp = {
          case (
              (featureSum: DenseVector, squaredNormSum: Double, numOfPoints: Long),
              (features, squaredNorm)
            ) =>
            BLAS.axpy(1.0, features, featureSum)
            (featureSum, squaredNormSum + squaredNorm, numOfPoints + 1)
        },
        combOp = {
          case (
              (featureSum1, squaredNormSum1, numOfPoints1),
              (featureSum2, squaredNormSum2, numOfPoints2)
            ) =>
            BLAS.axpy(1.0, featureSum2, featureSum1)
            (featureSum1, squaredNormSum1 + squaredNormSum2, numOfPoints1 + numOfPoints2)
        }
      )

    clustersStatsRDD
      .collectAsMap()
      .mapValues {
        case (featureSum: DenseVector, squaredNormSum: Double, numOfPoints: Long) =>
          SquaredEuclideanSilhouette.ClusterStats(featureSum, squaredNormSum, numOfPoints)
      }
      .toMap
  }

  /**
   * It computes the Silhouette coefficient for a point.
   *
   * @param broadcastedClustersMap A map of the precomputed values for each cluster.
   * @param point                  The [[linalg.Vector]] representing the current point.
   * @param clusterId              The id of the cluster the current point belongs to.
   * @param squaredNorm            The `$\xi_{X}$` (which is the squared norm) precomputed
   *                               for the point.
   * @return The Silhouette for the point.
   */
  def computeSilhouetteCoefficient(
      broadcastedClustersMap: Broadcast[Map[Double, ClusterStats]],
      point: linalg.Vector,
      clusterId: Double,
      squaredNorm: Double): Double = {
    val clustersMap = broadcastedClustersMap.value

    // Average squared Euclidean distance of the point to the target cluster:
    // xi_X + Psi / N - 2 * (Y . X) / N
    def compute(targetClusterId: Double): Double = {
      val clusterStats = clustersMap(targetClusterId)
      val pointDotClusterFeaturesSum = BLAS.dot(point, clusterStats.featureSum)

      squaredNorm +
        clusterStats.squaredNormSum / clusterStats.numOfPoints -
        2 * pointDotClusterFeaturesSum / clusterStats.numOfPoints
    }

    pointSilhouetteCoefficient(
      clustersMap.keySet,
      clusterId,
      clustersMap(clusterId).numOfPoints,
      compute)
  }

  /**
   * Compute the Silhouette score of the dataset using squared Euclidean distance measure.
   *
   * @param dataset The input dataset (previously clustered) on which compute the Silhouette.
   * @param predictionCol The name of the column which contains the predicted cluster id
   *                      for the point.
   * @param featuresCol The name of the column which contains the feature vector of the point.
   * @return The average of the Silhouette values of the clustered data.
   */
  def computeSilhouetteScore(
      dataset: Dataset[_],
      predictionCol: String,
      featuresCol: String): Double = {
    SquaredEuclideanSilhouette.registerKryoClasses(dataset.sparkSession.sparkContext)

    val squaredNormUDF = udf {
      features: linalg.Vector => math.pow(Vectors.norm(features, 2.0), 2.0)
    }
    val dfWithSquaredNorm = dataset.withColumn("squaredNorm", squaredNormUDF(col(featuresCol)))

    // compute aggregate values for clusters needed by the algorithm
    val clustersStatsMap = SquaredEuclideanSilhouette
      .computeClusterStats(dfWithSquaredNorm, predictionCol, featuresCol)

    // Silhouette is reasonable only when the number of clusters is greater than 1
    assert(clustersStatsMap.size > 1, "Number of clusters must be greater than one.")

    val bClustersStatsMap = dataset.sparkSession.sparkContext.broadcast(clustersStatsMap)

    // Release the broadcast even when the scoring job fails, so it is never leaked.
    try {
      val computeSilhouetteCoefficientUDF = udf {
        computeSilhouetteCoefficient(bClustersStatsMap, _: linalg.Vector, _: Double, _: Double)
      }

      overallScore(dfWithSquaredNorm,
        computeSilhouetteCoefficientUDF(col(featuresCol), col(predictionCol).cast(DoubleType),
          col("squaredNorm")))
    } finally {
      bClustersStatsMap.destroy()
    }
  }
}


/**
 * The algorithm which is implemented in this object is an efficient and parallel
 * implementation of the Silhouette using the cosine distance measure. The cosine distance
 * measure is defined as `1 - s` where `s` is the cosine similarity between two points.
 *
 * The total distance of the point `X` to the points `$C_{i}$` belonging to the cluster `$\Gamma$`
 * is:
 *
 * 
 *   $$
 *   \sum\limits_{i=1}^N d(X, C_{i} ) =
 *   \sum\limits_{i=1}^N \Big( 1 - \frac{\sum\limits_{j=1}^D x_{j}c_{ij} }{ \|X\|\|C_{i}\|} \Big)
 *   = \sum\limits_{i=1}^N 1 - \sum\limits_{i=1}^N \sum\limits_{j=1}^D \frac{x_{j}}{\|X\|}
 *   \frac{c_{ij}}{\|C_{i}\|}
 *   = N - \sum\limits_{j=1}^D \frac{x_{j}}{\|X\|} \Big( \sum\limits_{i=1}^N
 *   \frac{c_{ij}}{\|C_{i}\|} \Big)
 *   $$
 * 
 *
 * where `$x_{j}$` is the `j`-th dimension of the point `X` and `$c_{ij}$` is the `j`-th dimension
 * of the `i`-th point in cluster `$\Gamma$`.
 *
 * Then, we can define the vector:
 *
 * 
 *   $$
 *   \xi_{X} : \xi_{X i} = \frac{x_{i}}{\|X\|}, i = 1, ..., D
 *   $$
 * 
 *
 * which can be precomputed for each point and the vector
 *
 * 
 *   $$
 *   \Omega_{\Gamma} : \Omega_{\Gamma i} = \sum\limits_{j=1}^N \xi_{C_{j}i}, i = 1, ..., D
 *   $$
 * 
 *
 * which can be precomputed too for each cluster `$\Gamma$` by its points `$C_{i}$`.
 *
 * With these definitions, the numerator becomes:
 *
 * 
 *   $$
 *   N - \sum\limits_{j=1}^D \xi_{X j} \Omega_{\Gamma j}
 *   $$
 * 
 *
 * Thus the average distance of a point `X` to the points of the cluster `$\Gamma$` is:
 *
 * 
 *   $$
 *   1 - \frac{\sum\limits_{j=1}^D \xi_{X j} \Omega_{\Gamma j}}{N}
 *   $$
 * 
 *
 * In the implementation, the precomputed values for the clusters are distributed among the worker
 * nodes via broadcasted variables, because we can assume that the clusters are limited in number.
 *
 * The main strengths of this algorithm are the low computational complexity and the intrinsic
 * parallelism. The precomputed information for each point and for each cluster can be computed
 * with a computational complexity which is `O(N/W)`, where `N` is the number of points in the
 * dataset and `W` is the number of worker nodes. After that, every point can be analyzed
 * independently from the others.
 *
 * For every point we need to compute the average distance to all the clusters. Since the formula
 * above requires `O(D)` operations, this phase has a computational complexity which is
 * `O(C*D*N/W)` where `C` is the number of clusters (which we assume quite low), `D` is the number
 * of dimensions, `N` is the number of points in the dataset and `W` is the number of worker
 * nodes.
 */
private[evaluation] object CosineSilhouette extends Silhouette {

  // Name of the temporary column holding each point divided by its L2 norm.
  private[this] val normalizedFeaturesColName = "normalizedFeatures"

  /**
   * The method takes the input dataset and computes the aggregated values
   * about a cluster which are needed by the algorithm.
   *
   * Note that `featuresCol` comes before `predictionCol` in this signature.
   *
   * @param df The DataFrame which contains the input data; it must also contain the
   *           `normalizedFeatures` column, which is the one actually aggregated.
   * @param featuresCol The name of the column which contains the feature vector of the point;
   *                    here it is only used to determine the number of features.
   * @param predictionCol The name of the column which contains the predicted cluster id
   *                      for the point.
   * @return A [[scala.collection.immutable.Map]] which associates each cluster id to
   *         its statistics (ie. the precomputed values `$\Omega_{\Gamma}$` and `N`).
   */
  def computeClusterStats(
      df: DataFrame,
      featuresCol: String,
      predictionCol: String): Map[Double, (linalg.Vector, Long)] = {
    val numFeatures = getNumberOfFeatures(df, featuresCol)
    val clustersStatsRDD = df.select(
        col(predictionCol).cast(DoubleType), col(normalizedFeaturesColName))
      .rdd
      .map { row => (row.getDouble(0), row.getAs[linalg.Vector](1)) }
      .aggregateByKey[(DenseVector, Long)]((Vectors.zeros(numFeatures).toDense, 0L))(
        // Both operators accumulate in place into the left-hand feature sum.
        seqOp = {
          case ((normalizedFeaturesSum: DenseVector, numOfPoints: Long), (normalizedFeatures)) =>
            BLAS.axpy(1.0, normalizedFeatures, normalizedFeaturesSum)
            (normalizedFeaturesSum, numOfPoints + 1)
        },
        combOp = {
          case ((normalizedFeaturesSum1, numOfPoints1), (normalizedFeaturesSum2, numOfPoints2)) =>
            BLAS.axpy(1.0, normalizedFeaturesSum2, normalizedFeaturesSum1)
            (normalizedFeaturesSum1, numOfPoints1 + numOfPoints2)
        }
      )

    clustersStatsRDD
      .collectAsMap()
      .toMap
  }

  /**
   * It computes the Silhouette coefficient for a point.
   *
   * @param broadcastedClustersMap A map of the precomputed values for each cluster.
   * @param normalizedFeatures     The [[linalg.Vector]] representing the
   *                               normalized features of the current point.
   * @param clusterId              The id of the cluster the current point belongs to.
   * @return The Silhouette for the point.
   */
  def computeSilhouetteCoefficient(
      broadcastedClustersMap: Broadcast[Map[Double, (linalg.Vector, Long)]],
      normalizedFeatures: linalg.Vector,
      clusterId: Double): Double = {
    val clustersMap = broadcastedClustersMap.value

    // Average cosine distance of the point to the target cluster: 1 - (xi_X . Omega) / N
    def compute(targetClusterId: Double): Double = {
      val (normalizedFeatureSum, numOfPoints) = clustersMap(targetClusterId)
      1 - BLAS.dot(normalizedFeatures, normalizedFeatureSum) / numOfPoints
    }

    pointSilhouetteCoefficient(
      clustersMap.keySet,
      clusterId,
      clustersMap(clusterId)._2,
      compute)
  }

  /**
   * Compute the Silhouette score of the dataset using the cosine distance measure.
   *
   * @param dataset The input dataset (previously clustered) on which compute the Silhouette.
   * @param predictionCol The name of the column which contains the predicted cluster id
   *                      for the point.
   * @param featuresCol The name of the column which contains the feature vector of the point.
   * @return The average of the Silhouette values of the clustered data.
   */
  def computeSilhouetteScore(
      dataset: Dataset[_],
      predictionCol: String,
      featuresCol: String): Double = {
    // Divides every component by the L2 norm of the vector, keeping its dense/sparse layout.
    val normalizeFeatureUDF = udf {
      features: linalg.Vector => {
        val norm = Vectors.norm(features, 2.0)
        features match {
          case d: DenseVector => Vectors.dense(d.values.map(_ / norm))
          case s: IntSparseVector => Vectors.sparse(s.size.toInt, s.indices, s.values.map(_ / norm))
          case s: LongSparseVector => Vectors.sparse(s.size.toInt, s.indices, s.values.map(_ / norm))
          case _ => throw new Exception("Vector Type Error!")
        }
      }
    }
    val dfWithNormalizedFeatures = dataset.withColumn(normalizedFeaturesColName,
      normalizeFeatureUDF(col(featuresCol)))

    // compute aggregate values for clusters needed by the algorithm
    val clustersStatsMap = computeClusterStats(dfWithNormalizedFeatures, featuresCol,
      predictionCol)

    // Silhouette is reasonable only when the number of clusters is greater than 1
    assert(clustersStatsMap.size > 1, "Number of clusters must be greater than one.")

    val bClustersStatsMap = dataset.sparkSession.sparkContext.broadcast(clustersStatsMap)

    // Release the broadcast even when the scoring job fails, so it is never leaked.
    try {
      val computeSilhouetteCoefficientUDF = udf {
        computeSilhouetteCoefficient(bClustersStatsMap, _: linalg.Vector, _: Double)
      }

      overallScore(dfWithNormalizedFeatures,
        computeSilhouetteCoefficientUDF(col(normalizedFeaturesColName),
          col(predictionCol).cast(DoubleType)))
    } finally {
      bClustersStatsMap.destroy()
    }
  }
}