All downloads are free. Search and download functionality uses the official Maven repository.

com.twitter.scalding.examples.KMeans.scala Maven / Gradle / Ivy

package com.twitter.scalding.examples

import com.twitter.scalding._
import com.twitter.scalding.typed.ComputedValue

/**
 * A k-means clustering example written against scalding's `TypedPipe`,
 * `ValuePipe` and `Execution` APIs.
 *
 * `apply` builds the initial clusters and labels, then `kmeans` repeats
 * `kmeansStep` until a step moves no vector to a different cluster.
 */
object KMeans {

  /**
   * The Euclidean distance between two vectors.
   *
   * The two vectors are zipped, so if their lengths differ only the leading
   * components they have in common contribute to the result.
   */
  private def distance(v1: Vector[Double], v2: Vector[Double]): Double =
    math.sqrt(v1.iterator
      .zip(v2.iterator)
      .map { case (l, r) => (l - r) * (l - r) }
      .sum)

  // Component-wise vector addition. Because the inputs are zipped, the result
  // is as long as the shorter of the two.
  private def add(v1: Vector[Double], v2: Vector[Double]): Vector[Double] =
    v1.zip(v2).map { case (l, r) => l + r }

  // Scalar multiplication: every component of v is multiplied by s.
  private def scale(s: Double, v: Vector[Double]): Vector[Double] =
    v.map { x => s * x }

  /**
   * The centroid (component-wise mean) of some vectors, computed in one pass.
   *
   * `vecs` must be non-empty: `reduce` fails on an empty collection.
   */
  private def centroidOf(vecs: TraversableOnce[Vector[Double]]): Vector[Double] = {
    val (vec, count) = vecs
      // pair each vector with a 1 so the vectors are counted in the same pass.
      // The counter is a Long so that a group holding more than Int.MaxValue
      // vectors does not overflow it.
      .map { v => (v, 1L) }
      // Here we add both the count and the vectors:
      .reduce { (ll, rr) =>
        val (l, lc) = ll
        val (r, rc) = rr
        (add(l, r), lc + rc)
      }
    // Now scale the summed vector to get the pointwise average
    scale(1.0 / count, vec)
  }

  /**
   * Returns the (id, centroid) pair whose centroid is nearest to `from`
   * according to `distance`.
   *
   * `centroids` must be non-empty: `minBy` fails on an empty collection.
   */
  private def closest[Id](from: Vector[Double],
    centroids: TraversableOnce[(Id, Vector[Double])]): (Id, Vector[Double]) =
    centroids
      // compute the distance to each center, exactly once per centroid
      .map { case (id, cent) => (distance(from, cent), (id, cent)) }
      // take the minimum by the distance, ignoring the id and the centroid
      .minBy { case (dist, _) => dist }
      // Just keep the id and the centroid
      ._2

  /**
   * A vector tagged with a cluster id. It is used both for centroids
   * (id, centroid) and for data points (assigned cluster id, point).
   */
  type LabeledVector = (Int, Vector[Double])

  /**
   * This runs one step in a kmeans algorithm.
   *
   * Every point is relabeled with the id of its closest centroid, and `s` is
   * incremented once for each point whose id changed. The returned Execution
   * yields the new clusters (the centroid of the points now carrying each id)
   * and the new list of labeled vectors.
   *
   * The number of vectors that changed clusters is NOT part of the returned
   * value; it is reported only through the counter `s`.
   *
   * @param k the number of clusters, used here to cap the number of reducers
   * @param s counter incremented for every vector that changes cluster
   * @param clusters the current centroids
   * @param points the currently labeled vectors
   */
  def kmeansStep(k: Int,
    s: Stat,
    clusters: ValuePipe[List[LabeledVector]],
    points: TypedPipe[LabeledVector]): Execution[(ValuePipe[List[LabeledVector]], TypedPipe[LabeledVector])] = {

    // Do a cross product to produce all point, cluster pairs
    // in scalding, the smaller pipe should go on the right.
    val next = points.leftCross(clusters)
      // now compute the closest cluster for each vector
      .map {
        case ((oldId, vector), Some(centroids)) =>
          // only the id of the closest centroid is needed here
          val (id, _) = closest(vector, centroids)
          if (id != oldId) s.inc()
          (id, vector)
        case (_, None) => sys.error("Missing clusters, this should never happen")
      }
      .forceToDiskExecution

    // Now update the clusters:
    // NOTE(review): the new centroids are built by grouping the relabeled
    // points, so an id that no point carries would seem to drop out of the
    // new cluster list -- confirm that this is intended.
    next.map { pipe =>
      (ComputedValue(pipe
        .group
        // There is no need to use more than k reducers
        .withReducers(k)
        .mapValueStream { vectors => Iterator(centroidOf(vectors)) }
        // Now collect them all into one big list
        .groupAll
        .toList
        // discard the "all" key used to group them together
        .values), pipe)
    }
  }

  /**
   * Builds the starting state for the algorithm: up to `k` vectors taken from
   * `points` as the initial centroids (with ids assigned by `zipWithIndex`,
   * starting at 0), and every point labeled with `rng.nextInt(k)`.
   *
   * `k` must be positive, because `java.util.Random.nextInt(bound)` rejects a
   * non-positive bound.
   */
  def initializeClusters(k: Int, points: TypedPipe[Vector[Double]]): (ValuePipe[List[LabeledVector]], TypedPipe[LabeledVector]) = {
    // NOTE(review): this fixed-seed generator is captured by both map closures
    // below; how its state is shared or copied between tasks depends on how
    // the closures are shipped -- confirm the sampling is as intended.
    val rng = new java.util.Random(123)
    // take a random k vectors: tag each vector with a random double and keep
    // the k vectors with the smallest tags
    val clusters = points.map { v => (rng.nextDouble, v) }
      .groupAll
      .sortedTake(k)(Ordering.by(_._1))
      .mapValues { randk =>
        randk.iterator
          .zipWithIndex
          // drop the random tag and use the position as the cluster id
          .map { case ((_, v), id) => (id, v) }
          .toList
      }
      .values

    // attach a random cluster to each vector
    val labeled = points.map { v => (rng.nextInt(k), v) }

    (ComputedValue(clusters), labeled)
  }

  /*
   * Run the full k-means algorithm by flatMapping the above function into itself
   * while the number of vectors that changed is not zero.
   *
   * The Int in the result is the number of steps that changed at least one
   * vector: it starts at 0 and is incremented each time another step is needed.
   */
  def kmeans(k: Int,
    clusters: ValuePipe[List[LabeledVector]],
    points: TypedPipe[LabeledVector]): Execution[(Int, ValuePipe[List[LabeledVector]], TypedPipe[LabeledVector])] = {

    val key = StatKey("changed", "scalding.kmeans")

    def go(s: Stat,
      c: ValuePipe[List[LabeledVector]],
      p: TypedPipe[LabeledVector],
      step: Int): Execution[(Int, ValuePipe[List[LabeledVector]], TypedPipe[LabeledVector])] =

      kmeansStep(k, s, c, p)
        // read the counters produced by this step and reset them, so the next
        // step starts counting changes from zero
        .getAndResetCounters
        .flatMap {
          case ((nextC, nextP), counters) =>
            val changed = counters(key)
            // no vector moved: the clustering is stable, so stop here
            if (changed == 0L) Execution.from((step, nextC, nextP))
            else go(s, nextC, nextP, step + 1)
        }

    Execution.withId { implicit uid =>
      go(Stat(key), clusters, points, 0)
    }
  }

  /**
   * Runs k-means on `points`: the clusters and labels are created by
   * `initializeClusters` and then refined by `kmeans`.
   */
  def apply(k: Int, points: TypedPipe[Vector[Double]]): Execution[(Int, ValuePipe[List[LabeledVector]], TypedPipe[LabeledVector])] = {
    val (clusters, labeled) = initializeClusters(k, points)
    kmeans(k, clusters, labeled)
  }
}




© 2015 - 2025 Weber Informatics LLC | Privacy Policy