All Downloads are FREE. Search and download functionalities are using the official Maven repository.

spark.examples.LocalKMeans.scala Maven / Gradle / Ivy

package spark.examples

import java.util.Random
import spark.util.Vector
import spark.SparkContext._
import scala.collection.mutable.HashMap
import scala.collection.mutable.HashSet

/**
 * K-means clustering run entirely in local memory on randomly generated data.
 *
 * `N` points with `D` coordinates each are generated, `K` distinct points are
 * picked as the initial centers, and the centers are then re-estimated until
 * the summed squared distance they move in one pass is at most `convergeDist`.
 */
object LocalKMeans {
  /** Number of data points generated by `generateData`. */
  val N = 1000
  /** Scaling factor multiplied into every generated coordinate. */
  val R = 1000
  /** Number of coordinates per generated point. */
  val D = 10
  /** Number of cluster centers. */
  val K = 10
  /** The main loop stops once the summed squared movement of the centers is no larger than this. */
  val convergeDist = 0.001
  /** Random source (seeded with 42) shared by data generation and initial center selection. */
  val rand = new Random(42)

  /**
   * Builds the input data set.
   *
   * Every call draws fresh values from `rand`, so successive calls return
   * different data.
   *
   * @return an array of `N` vectors, each with `D` coordinates equal to
   *         `rand.nextDouble * R`
   */
  def generateData: Array[Vector] =
    Array.fill(N)(Vector(D, _ => rand.nextDouble * R))

  /**
   * Finds the center closest to `p` by squared distance.
   *
   * Keys `1` to `centers.size` are looked up, so `centers` must contain
   * exactly those keys; a missing key raises `NoSuchElementException`.
   * Keys are visited in ascending order and only a strictly smaller distance
   * replaces the current best, so ties go to the lowest key.
   *
   * @param p       the point to classify
   * @param centers the current centers, keyed `1..centers.size`
   * @return the key of the closest center, or `0` when `centers` is empty
   */
  def closestPoint(p: Vector, centers: HashMap[Int, Vector]): Int = {
    val (bestIndex, _) =
      (1 to centers.size).foldLeft((0, Double.PositiveInfinity)) {
        case ((currentBest, currentDist), i) =>
          val dist = p.squaredDist(centers(i))
          if (dist < currentDist) (i, dist) else (currentBest, currentDist)
      }
    bestIndex
  }

  /**
   * Generates the data, runs k-means until convergence and prints the
   * initial and final centers.
   *
   * @param args command-line arguments (unused)
   */
  def main(args: Array[String]): Unit = {
    // The seeding loop below collects K distinct points out of N; it could
    // never finish if fewer than K points exist.
    require(K <= N, "K (" + K + ") must not exceed N (" + N + ")")

    val data = generateData
    val points = new HashSet[Vector]
    val kPoints = new HashMap[Int, Vector]
    var tempDist = 1.0

    // Sample until K distinct initial centers have been collected.
    while (points.size < K) {
      points.add(data(rand.nextInt(N)))
    }

    // Number the initial centers 1..K, as closestPoint expects.
    points.iterator.zipWithIndex.foreach { case (point, i) =>
      kPoints.put(i + 1, point)
    }

    println("Initial centers: " + kPoints)

    while (tempDist > convergeDist) {
      // Tag every point with the key of its closest center and a count of 1.
      val closest = data.map(p => (closestPoint(p, kPoints), (p, 1)))

      val mappings = closest.groupBy[Int](x => x._1)

      // Per center: the sum of the assigned points and how many there are.
      val pointStats = mappings.map { pair =>
        pair._2.reduceLeft[(Int, (Vector, Int))] {
          case ((id1, (x1, y1)), (id2, (x2, y2))) => (id1, (x1 + x2, y1 + y2))
        }
      }

      // New center = mean of the assigned points. A center with no assigned
      // points does not appear here and therefore keeps its old position.
      val newPoints = pointStats.map { case (id, (sum, count)) => (id, sum / count) }

      // Total squared distance the centers moved in this pass.
      tempDist = 0.0
      newPoints.foreach { case (id, center) =>
        tempDist += kPoints(id).squaredDist(center)
      }

      newPoints.foreach { case (id, center) =>
        kPoints.put(id, center)
      }
    }

    println("Final centers: " + kPoints)
  }
}




© 2015 - 2025 Weber Informatics LLC | Privacy Policy