All Downloads are FREE. Search and download functionalities are using the official Maven repository.

com.tencent.angel.sona.ml.clustering.DistanceMeasure.scala Maven / Gradle / Ivy

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package com.tencent.angel.sona.ml.clustering

import org.apache.spark.linalg
import org.apache.spark.linalg.BLAS.{axpy, dot, scal}
import org.apache.spark.linalg.Vectors
import com.tencent.angel.sona.ml.util.MLUtils

private[sona] abstract class DistanceMeasure extends Serializable {

  /**
    * Walks over `centers` once and keeps the one whose [[distance]] to `point` is smallest.
    * The comparison is strict, so on ties the earliest center is kept; when `centers` yields
    * nothing the result is `(0, Double.PositiveInfinity)`.
    *
    * @param centers candidate cluster centers
    * @param point   the point to be assigned
    * @return the index of the closest center to the given point, as well as the cost.
    */
  def findClosest(
      centers: TraversableOnce[VectorWithNorm],
      point: VectorWithNorm): (Int, Double) = {
    var closestIndex = 0
    var closestCost = Double.PositiveInfinity
    var position = 0
    for (candidate <- centers) {
      val candidateCost = distance(candidate, point)
      // Strictly smaller only: an equal cost never replaces an earlier center.
      if (candidateCost < closestCost) {
        closestIndex = position
        closestCost = candidateCost
      }
      position += 1
    }
    (closestIndex, closestCost)
  }

  /**
    * Cost component of [[findClosest]]; the winning index is discarded.
    *
    * @param centers candidate cluster centers
    * @param point   the point to evaluate
    * @return the K-means cost of a given point against the given cluster centers.
    */
  def pointCost(
      centers: TraversableOnce[VectorWithNorm],
      point: VectorWithNorm): Double = {
    val (_, closestCost) = findClosest(centers, point)
    closestCost
  }

  /**
    * A center counts as converged when it moved by at most `epsilon`, measured with
    * [[distance]].
    *
    * @param oldCenter the center before the update
    * @param newCenter the center after the update
    * @param epsilon   the largest movement still regarded as converged
    * @return whether a center converged or not, given the epsilon parameter.
    */
  def isCenterConverged(
      oldCenter: VectorWithNorm,
      newCenter: VectorWithNorm,
      epsilon: Double): Boolean = {
    val movement = distance(oldCenter, newCenter)
    movement <= epsilon
  }

  /**
    * The measure itself; every concrete subclass supplies it.
    *
    * @param v1 first point
    * @param v2 second point
    * @return the distance between two points.
    */
  def distance(
      v1: VectorWithNorm,
      v2: VectorWithNorm): Double

  /**
    * Cost of a whole cluster derived from aggregates rather than from the individual points.
    * Which of the aggregates are actually consulted is up to the implementation.
    *
    * @param centroid          the cluster centroid
    * @param pointsSum         the aggregated sum for the cluster
    * @param numberOfPoints    how many points belong to the cluster
    * @param pointsSquaredNorm the aggregated squared norm for the cluster
    * @return the total cost of the cluster from its aggregated properties
    */
  def clusterCost(
      centroid: VectorWithNorm,
      pointsSum: VectorWithNorm,
      numberOfPoints: Long,
      pointsSquaredNorm: Double): Double

  /**
    * Accumulates `point` into the running `sum` of a cluster by calling `axpy` with a
    * coefficient of `1.0`; `sum` is the vector that receives the update.
    *
    * @param point a `VectorWithNorm` to be added to `sum` of a cluster
    * @param sum   the `sum` for a cluster to be updated
    */
  def updateClusterSum(point: VectorWithNorm, sum: linalg.Vector): Unit = {
    val weight = 1.0
    axpy(weight, point.vector, sum)
  }

  /**
    * Turns an accumulated `sum` into a centroid: `sum` is passed to `scal` with the factor
    * `1.0 / count`, and that same vector is then wrapped (its norm is recomputed by the
    * `VectorWithNorm` constructor).
    *
    * @param sum   the `sum` for a cluster
    * @param count the number of points in the cluster
    * @return the centroid of the cluster
    */
  def centroid(sum: linalg.Vector, count: Long): VectorWithNorm = {
    val inverseCount = 1.0 / count
    scal(inverseCount, sum)
    new VectorWithNorm(sum)
  }

  /**
    * Returns two new centroids symmetric to the specified centroid, obtained by applying
    * `noise` with the specified `level`: the left one is `centroid - level * noise` and the
    * right one is `centroid + level * noise`. Both are built on copies of `centroid`.
    *
    * @param level    the level of `noise` to apply to the given centroid.
    * @param noise    a noise vector
    * @param centroid the parent centroid
    * @return a left and right centroid symmetric to `centroid`
    */
  def symmetricCentroids(
      level: Double,
      noise: linalg.Vector,
      centroid: linalg.Vector): (VectorWithNorm, VectorWithNorm) = {
    // Builds one child: a copy of the parent moved by `factor * noise`.
    def displaced(factor: Double): VectorWithNorm = {
      val child = centroid.copy
      axpy(factor, noise, child)
      new VectorWithNorm(child)
    }

    (displaced(-level), displaced(level))
  }

  /**
    * By default the assignment cost is simply [[distance]] between the two arguments.
    *
    * @param point    the point being assigned
    * @param centroid the centroid it is assigned to
    * @return the cost of a point to be assigned to the cluster centroid
    */
  def cost(
      point: VectorWithNorm,
      centroid: VectorWithNorm): Double = {
    distance(point, centroid)
  }
}


/**
  * A vector paired with a norm value so that the norm does not have to be recomputed by
  * code that needs both.
  *
  * @param vector the wrapped vector
  * @param norm   the norm stored alongside `vector`; the auxiliary constructors fill it with
  *               `Vectors.norm(vector, 2.0)`
  */
private[clustering] class VectorWithNorm(val vector: linalg.Vector, val norm: Double)
  extends Serializable {

  /** Wraps `vector`, computing its norm with `Vectors.norm(vector, 2.0)`. */
  def this(vector: linalg.Vector) = this(vector, Vectors.norm(vector, 2.0))

  /** Wraps `array` as a dense vector and computes its norm as above. */
  def this(array: Array[Double]) = this(Vectors.dense(array))

  /**
    * Converts the vector to a dense vector.
    *
    * @return a new instance holding a dense vector built from `vector.toArray`, with the
    *         stored `norm` carried over unchanged
    */
  def toDense: VectorWithNorm = {
    val denseVector = Vectors.dense(vector.toArray)
    new VectorWithNorm(denseVector, norm)
  }
}


/**
  * Names of the supported distance measures, plus helpers that validate a name and turn it
  * into the matching [[DistanceMeasure]] implementation.
  */
object DistanceMeasure {

  /** Name selecting [[EuclideanDistanceMeasure]]. */
  val EUCLIDEAN = "euclidean"

  /** Name selecting [[CosineDistanceMeasure]]. */
  val COSINE = "cosine"

  /**
    * Instantiates the distance measure identified by `distanceMeasure`.
    *
    * @param distanceMeasure either [[EUCLIDEAN]] or [[COSINE]]
    * @return a new instance of the corresponding implementation
    * @throws IllegalArgumentException if the name is neither of the two supported values
    */
  private[sona] def decodeFromString(distanceMeasure: String): DistanceMeasure = {
    if (distanceMeasure == EUCLIDEAN) {
      new EuclideanDistanceMeasure
    } else if (distanceMeasure == COSINE) {
      new CosineDistanceMeasure
    } else {
      throw new IllegalArgumentException(s"distanceMeasure must be one of: " +
        s"$EUCLIDEAN, $COSINE. $distanceMeasure provided.")
    }
  }

  /**
    * Checks a distance measure name without instantiating anything.
    *
    * @param distanceMeasure the name to check
    * @return `true` when the name equals [[EUCLIDEAN]] or [[COSINE]], `false` otherwise
    */
  private[sona] def validateDistanceMeasure(distanceMeasure: String): Boolean = {
    distanceMeasure == DistanceMeasure.EUCLIDEAN || distanceMeasure == DistanceMeasure.COSINE
  }
}

private[sona] class EuclideanDistanceMeasure extends DistanceMeasure {
  /**
    * Nearest-center search working on squared distances. For each center the squared
    * difference of the two stored norms is checked first; the full squared distance is only
    * computed when that bound is below the best value found so far. Comparisons are strict,
    * so ties keep the earliest center, and an empty `centers` yields
    * `(0, Double.PositiveInfinity)`.
    *
    * @param centers candidate cluster centers
    * @param point   the point to be assigned
    * @return the index of the closest center to the given point, as well as the squared distance.
    */
  override def findClosest(
      centers: TraversableOnce[VectorWithNorm],
      point: VectorWithNorm): (Int, Double) = {
    var closestIndex = 0
    var closestSqDist = Double.PositiveInfinity
    var position = 0
    for (candidate <- centers) {
      // Because `\|a - b\| \geq |\|a\| - \|b\||`, the squared gap between the norms is a
      // lower bound of the squared distance and lets us skip hopeless candidates cheaply.
      val normGap = candidate.norm - point.norm
      val sqDistLowerBound = normGap * normGap
      if (sqDistLowerBound < closestSqDist) {
        val sqDist: Double = EuclideanDistanceMeasure.fastSquaredDistance(candidate, point)
        if (sqDist < closestSqDist) {
          closestSqDist = sqDist
          closestIndex = position
        }
      }
      position += 1
    }
    (closestIndex, closestSqDist)
  }

  /**
    * Convergence test carried out on squared quantities: the squared distance between the
    * two centers is compared against `epsilon * epsilon`, so no square root is taken.
    *
    * @param oldCenter the center before the update
    * @param newCenter the center after the update
    * @param epsilon   the largest movement still regarded as converged
    * @return whether a center converged or not, given the epsilon parameter.
    */
  override def isCenterConverged(
      oldCenter: VectorWithNorm,
      newCenter: VectorWithNorm,
      epsilon: Double): Boolean = {
    val sqMovement = EuclideanDistanceMeasure.fastSquaredDistance(newCenter, oldCenter)
    val sqTolerance = epsilon * epsilon
    sqMovement <= sqTolerance
  }

  /**
    * Square root of the squared distance provided by the companion object.
    *
    * @param v1 : first vector
    * @param v2 : second vector
    * @return the Euclidean distance between the two input vectors
    */
  override def distance(v1: VectorWithNorm, v2: VectorWithNorm): Double = {
    val sqDist = EuclideanDistanceMeasure.fastSquaredDistance(v1, v2)
    math.sqrt(sqDist)
  }

  /**
    * Computes `pointsSquaredNorm - numberOfPoints * centroid.norm^2`, clamped at zero.
    * `pointsSum` is not consulted by this measure.
    *
    * @param centroid          the cluster centroid
    * @param pointsSum         the aggregated sum for the cluster (unused here)
    * @param numberOfPoints    how many points belong to the cluster
    * @param pointsSquaredNorm the aggregated squared norm for the cluster
    * @return the total cost of the cluster from its aggregated properties
    */
  override def clusterCost(
      centroid: VectorWithNorm,
      pointsSum: VectorWithNorm,
      numberOfPoints: Long,
      pointsSquaredNorm: Double): Double = {
    val weightedCentroidSqNorm = numberOfPoints * centroid.norm * centroid.norm
    // Clamp at zero so the reported cost is never negative.
    math.max(pointsSquaredNorm - weightedCentroidSqNorm, 0.0)
  }

  /**
    * Unlike [[distance]], the assignment cost is the squared distance (no square root).
    *
    * @param point    the point being assigned
    * @param centroid the centroid it is assigned to
    * @return the cost of a point to be assigned to the cluster centroid
    */
  override def cost(
      point: VectorWithNorm,
      centroid: VectorWithNorm): Double =
    EuclideanDistanceMeasure.fastSquaredDistance(point, centroid)
}


private[sona] object EuclideanDistanceMeasure {
  /**
    * Thin adapter that unpacks both arguments into their vector and stored norm and hands
    * them to `MLUtils.fastSquaredDistance`.
    *
    * @param v1 first vector together with its norm
    * @param v2 second vector together with its norm
    * @return the squared Euclidean distance between two vectors computed by
    *         [[MLUtils#fastSquaredDistance]].
    */
  private[clustering] def fastSquaredDistance(v1: VectorWithNorm, v2: VectorWithNorm): Double =
    MLUtils.fastSquaredDistance(v1.vector, v1.norm, v2.vector, v2.norm)
}

private[sona] class CosineDistanceMeasure extends DistanceMeasure {
  /**
    * Cosine distance: one minus the dot product of the two vectors divided by each of their
    * stored norms. Both norms are asserted to be strictly positive first.
    *
    * @param v1 : first vector
    * @param v2 : second vector
    * @return the cosine distance between the two input vectors
    */
  override def distance(v1: VectorWithNorm, v2: VectorWithNorm): Double = {
    assert(v1.norm > 0 && v2.norm > 0, "Cosine distance is not defined for zero-length vectors.")
    1 - dot(v1.vector, v2.vector) / v1.norm / v2.norm
  }

  /**
    * Updates the value of `sum` adding the `point` vector, weighted by `1 / point.norm` so
    * that each point is accumulated scaled by the inverse of its own norm. The point's norm
    * is asserted to be strictly positive first.
    *
    * @param point a `VectorWithNorm` to be added to `sum` of a cluster
    * @param sum   the `sum` for a cluster to be updated
    */
  override def updateClusterSum(point: VectorWithNorm, sum: linalg.Vector): Unit = {
    assert(point.norm > 0, "Cosine distance is not defined for zero-length vectors.")
    axpy(1.0 / point.norm, point.vector, sum)
  }

  /**
    * Returns a centroid for a cluster given its `sum` vector and its `count` of points.
    * `sum` is first scaled by `1 / count`, then by the inverse of its resulting 2-norm, and
    * that same vector is wrapped with the norm recorded as 1.
    *
    * @param sum   the `sum` for a cluster
    * @param count the number of points in the cluster
    * @return the centroid of the cluster
    */
  override def centroid(sum: linalg.Vector, count: Long): VectorWithNorm = {
    scal(1.0 / count, sum)
    val norm = Vectors.norm(sum, 2)
    scal(1.0 / norm, sum)
    new VectorWithNorm(sum, 1)
  }

  /**
    * Computes `numberOfPoints - (centroid . pointsSum) / centroid.norm`, clamped at zero.
    * `pointsSquaredNorm` is not consulted by this measure.
    *
    * @param centroid          the cluster centroid
    * @param pointsSum         the aggregated sum for the cluster
    * @param numberOfPoints    how many points belong to the cluster
    * @param pointsSquaredNorm the aggregated squared norm for the cluster (unused here)
    * @return the total cost of the cluster from its aggregated properties
    */
  override def clusterCost(
      centroid: VectorWithNorm,
      pointsSum: VectorWithNorm,
      numberOfPoints: Long,
      pointsSquaredNorm: Double): Double = {
    // `pointsSum.vector` is only read here, so it is handed to `dot` directly (the same way
    // `distance` does) instead of allocating a throw-away copy on every call.
    val projection = dot(centroid.vector, pointsSum.vector) / centroid.norm
    // Clamp at zero so the reported cost is never negative.
    math.max(numberOfPoints - projection, 0.0)
  }

  /**
    * Returns two new centroids symmetric to the specified centroid, obtained by applying
    * `noise` with the specified `level`. The pair produced by the parent implementation is
    * rescaled by the inverse of each vector's own norm, and both results are wrapped with
    * the norm recorded as 1.
    *
    * @param level    the level of `noise` to apply to the given centroid.
    * @param noise    a noise vector
    * @param centroid the parent centroid
    * @return a left and right centroid symmetric to `centroid`
    */
  override def symmetricCentroids(
      level: Double,
      noise: linalg.Vector,
      centroid: linalg.Vector): (VectorWithNorm, VectorWithNorm) = {
    val (left, right) = super.symmetricCentroids(level, noise, centroid)
    val leftVector = left.vector
    val rightVector = right.vector
    // The vectors come from copies made by the parent implementation, so rescaling them
    // leaves the caller's `centroid` untouched.
    scal(1.0 / left.norm, leftVector)
    scal(1.0 / right.norm, rightVector)
    (new VectorWithNorm(leftVector, 1.0), new VectorWithNorm(rightVector, 1.0))
  }
}




© 2015 - 2025 Weber Informatics LLC | Privacy Policy