package com.datastax.data.prepare.spark.dataset.louvain
import org.apache.spark.SparkContext
import org.apache.spark.broadcast.Broadcast
import org.apache.spark.graphx._
import org.apache.spark.ml.feature.StringIndexer
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.{DataFrame, SparkSession}
import org.apache.spark.sql.functions._
import org.apache.spark.sql.types.{DoubleType, LongType}
import scala.reflect.ClassTag
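/**
 * Distributed Louvain community detection over a weighted GraphX graph: each level
 * greedily moves vertices to the neighboring community with the best modularity gain
 * (louvain), then collapses every community into a single vertex (compressGraph),
 * repeating while modularity keeps improving.
 *
 * LouvainData2 is not defined in this file; from the way it is used below it is
 * assumed to be a mutable, serializable vertex state roughly of this shape (a
 * sketch, not the actual definition):
 *
 *   class LouvainData2(var community: Long, var communitySigmaTot: Double,
 *                      var internalWeight: Double, var nodeWeight: Double,
 *                      var changed: Boolean) extends Serializable {
 *     def this() = this(-1L, 0.0, 0.0, 0.0, false)
 *   }
 */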
class Louvain2 extends Serializable {
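/**
 * Initialize the Louvain state: every vertex starts in its own community, with
 * communitySigmaTot and nodeWeight set to its weighted degree; parallel edges
 * are merged by summing their weights.
 */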
def createLouvainGraph[VD: ClassTag](graph: Graph[VD, Double]): Graph[LouvainData2, Double] = {
val nodeWeights = graph.aggregateMessages(
(e:EdgeContext[VD, Double, Double]) => {
e.sendToSrc(e.attr)
e.sendToDst(e.attr)
},
(e1: Double, e2: Double) => e1 + e2
)
graph.outerJoinVertices(nodeWeights)((vid, _, weightOption) => {
val weight = weightOption.getOrElse(0.0)
new LouvainData2(vid, weight, 0.0, weight, false)
}).partitionBy(PartitionStrategy.EdgePartition2D).groupEdges(_ + _)
}
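/**
 * For every edge, send each endpoint's (community, communitySigmaTot) pair, keyed by
 * the edge weight, to the opposite endpoint. After aggregation every vertex holds a
 * map from each neighboring (communityId, sigmaTot) to the total weight of its edges
 * into that community.
 */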
def sendCommunityData(e: EdgeContext[LouvainData2, Double, Map[(Long, Double), Double]]) = {
val m1 = Map((e.srcAttr.community, e.srcAttr.communitySigmaTot) -> e.attr)
val m2 = Map((e.dstAttr.community, e.dstAttr.communitySigmaTot) -> e.attr)
e.sendToSrc(m2)
e.sendToDst(m1)
}
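/**
 * Merge two community-message maps, summing the edge weights of entries that share
 * the same (communityId, sigmaTot) key.
 */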
def mergeCommunityMessages(m1: Map[(Long, Double), Double], m2: Map[(Long, Double), Double]) = {
val newMap = scala.collection.mutable.HashMap[(Long, Double), Double]()
m1.foreach({ case (k, v) =>
if (newMap.contains(k)) newMap(k) = newMap(k) + v
else newMap(k) = v
})
m2.foreach({ case (k, v) =>
if (newMap.contains(k)) newMap(k) = newMap(k) + v
else newMap(k) = v
})
newMap.toMap
}
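/**
 * Modularity gain (up to the constant 1/M factor) of placing the current vertex in
 * testCommunityId, following the standard Louvain delta-Q used below:
 *
 *   deltaQ = k_i_in - k_i * sigma_tot / M
 *
 * where k_i_in is the weight of the vertex's edges into the test community (plus its
 * internal weight when it is already a member), k_i is its total weighted degree
 * (nodeWeight + internalWeight), sigma_tot is the test community's total weight
 * (minus k_i when the vertex is already a member), and M is the total edge weight of
 * the graph (the 2m term). BigDecimal avoids precision loss when comparing many
 * near-equal gains.
 */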
def q(currCommunityId: Long, testCommunityId: Long, testSigmaTot: Double,
edgeWeightInCommunity: Double, nodeWeight: Double, internalWeight: Double,
totalEdgeWeight: Double): BigDecimal = {
val isCurrentCommunity = currCommunityId == testCommunityId
val M = BigDecimal(totalEdgeWeight)
val k_i_in_L = if (isCurrentCommunity) edgeWeightInCommunity + internalWeight else edgeWeightInCommunity
val k_i_in = BigDecimal(k_i_in_L)
val k_i = BigDecimal(nodeWeight + internalWeight)
val sigma_tot = if (isCurrentCommunity) BigDecimal(testSigmaTot) - k_i else BigDecimal(testSigmaTot)
var deltaQ = BigDecimal(0.0)
if (!(isCurrentCommunity && sigma_tot == BigDecimal(0))) {
deltaQ = k_i_in - (k_i * sigma_tot / M)
//println(s"$deltaQ = $k_i_in - ($k_i * $sigma_tot / $M)")
}
deltaQ
}
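/**
 * Join each vertex with the aggregated messages from its neighborhood and adopt the
 * community with the highest modularity gain. To avoid oscillation, moves toward a
 * lower community id are only applied on even cycles and moves toward a higher id
 * on odd cycles.
 */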
def louvainVertJoin(louvainGraph: Graph[LouvainData2, Double], msgRDD: VertexRDD[Map[(Long, Double), Double]],
totalEdgeWeight: Broadcast[Double], even: Boolean) = {
// innerJoin[U, VD2](other: RDD[(VertexId, U)])(f: (VertexId, VD, U) => VD2): VertexRDD[VD2]
louvainGraph.vertices.innerJoin(msgRDD)((vid, louvainData, communityMessages) => {
var bestCommunity = louvainData.community
val startingCommunityId = bestCommunity
var maxDeltaQ = BigDecimal(0.0)
var bestSigmaTot: Double = 0.0
// communityMessages: Map[(Long, Double), Double]
// e.g. (1, Map((3, 10.0) -> 2.0, (6, 4.0) -> 2.0, (2, 8.0) -> 2.0, (4, 8.0) -> 2.0, (5, 8.0) -> 2.0))
// i.e. communityId: 3, sigmaTotal: 10.0, communityEdgeWeight: 2.0
communityMessages.foreach({ case ((communityId, sigmaTotal), communityEdgeWeight) =>
val deltaQ = q(
startingCommunityId,
communityId,
sigmaTotal,
communityEdgeWeight,
louvainData.nodeWeight,
louvainData.internalWeight,
totalEdgeWeight.value)
//println(" community: " + communityId + " sigma: " + sigmaTotal + " edgeweight: " + communityEdgeWeight + " q: " + deltaQ)
if (deltaQ > maxDeltaQ || (deltaQ > 0 && (deltaQ == maxDeltaQ && communityId > bestCommunity))) {
maxDeltaQ = deltaQ
bestCommunity = communityId
bestSigmaTot = sigmaTotal
}
})
// to prevent deadlocks, only allow moves from a higher community id to a
// lower one on even cycles and from lower to higher on odd cycles
if (louvainData.community != bestCommunity && ((even &&
louvainData.community > bestCommunity) || (!even &&
louvainData.community < bestCommunity))) {
//println(" " + vid + " SWITCHED from " + louvainData.community + " to " + bestCommunity)
louvainData.community = bestCommunity
louvainData.communitySigmaTot = bestSigmaTot
louvainData.changed = true
}
else {
louvainData.changed = false
}
if (louvainData == null)
println("louvainData is null: " + vid)
louvainData
})
}
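/**
 * Run one level of Louvain: alternate even/odd reassignment cycles until the number
 * of moved vertices stops shrinking by at least minProgress for more than
 * progressCounter consecutive odd cycles, then use each vertex's neighborhood data
 * to compute the graph's modularity.
 *
 * @return (modularity, community-labeled graph, number of full passes)
 */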
def louvain(sc: SparkContext, graph: Graph[LouvainData2, Double], minProgress: Int = 1,
progressCounter: Int = 1): (Double, Graph[LouvainData2, Double], Int) = {
var louvainGraph = graph //.cache()
// compute the total weight of the graph (the 2m term of the modularity formula)
val graphWeight = louvainGraph.vertices.map(louvainVertex => {
val (vertexId, louvainData) = louvainVertex
louvainData.internalWeight + louvainData.nodeWeight
}).reduce(_ + _)
val totalGraphWeight = sc.broadcast(graphWeight)
println("totalEdgeWeight: " + totalGraphWeight.value)
// gather community information from each vertex's local neighborhood
var communityRDD =
louvainGraph.aggregateMessages(sendCommunityData, mergeCommunityMessages)
// materialize the message RDD and cache it in memory
var activeMessages = communityRDD.count()
var updated = 0L - minProgress
var even = false
var count = 0
val maxIter = 100000
var stop = 0
var updatedLastPhase = 0L
do {
count += 1
even = !even
// label each vertex with its best community based on neighboring
// community information
val labeledVertices = louvainVertJoin(louvainGraph, communityRDD,
totalGraphWeight, even).cache()
// calculate new sigma total value for each community (total weight
// of each community)
val communityUpdate = labeledVertices
.map({ case (vid, vdata) => (vdata.community, vdata.nodeWeight + vdata.internalWeight)})
.reduceByKey(_ + _).cache()
// map each vertex ID to its updated community information
val communityMapping = labeledVertices
.map({ case (vid, vdata) => (vdata.community, vid)})
.join(communityUpdate)
.map({ case (community, (vid, sigmaTot)) => (vid, (community, sigmaTot))})
.cache()
// join the community labeled vertices with the updated community info
val updatedVertices = labeledVertices.join(communityMapping)
.map({ case (vertexId, (louvainData, communityTuple)) =>
val (community, communitySigmaTot) = communityTuple
louvainData.community = community
louvainData.communitySigmaTot = communitySigmaTot
(vertexId, louvainData)
}).cache()
updatedVertices.count()
labeledVertices.unpersist(blocking = false)
communityUpdate.unpersist(blocking = false)
communityMapping.unpersist(blocking = false)
val prevG = louvainGraph
// update the LouvainData attached to louvainGraph's vertices
louvainGraph = louvainGraph.outerJoinVertices(updatedVertices)((vid, old, newOpt) => newOpt.getOrElse(old))
// louvainGraph.cache()
// gather community information from each vertex's local neighborhood
val oldMsgs = communityRDD
communityRDD = louvainGraph.aggregateMessages(sendCommunityData, mergeCommunityMessages).cache()
activeMessages = communityRDD.count() // materialize the new message RDD (and the graph) by forcing computation
oldMsgs.unpersist(blocking = false)
updatedVertices.unpersist(blocking = false)
prevG.unpersistVertices(blocking = false)
// half of the communities can switch on even cycles and the other half
// on odd cycles (to prevent deadlocks), so we only look for
// progress on odd cycles (after all vertices have had a chance to
// move)
if (even) updated = 0
updated = updated + louvainGraph.vertices.filter(_._2.changed).count
if (!even) {
println(" # vertices moved: " + java.text.NumberFormat.getInstance().format(updated))
if (updated >= updatedLastPhase - minProgress) stop += 1
updatedLastPhase = updated
}
} while (stop <= progressCounter && (even || (updated > 0 && count < maxIter)))
println("\nCompleted in " + count + " cycles")
// Use each vertex's neighboring community data to calculate the
// global modularity of the graph
val newVertices =
louvainGraph.vertices.innerJoin(communityRDD)((vertexId, louvainData,
communityMap) => {
// sum the nodes internal weight and all of its edges that are in
// its community
val community = louvainData.community
var accumulatedInternalWeight = louvainData.internalWeight
val sigmaTot = louvainData.communitySigmaTot.toDouble
def accumulateTotalWeight(totalWeight: Double, item: ((Long, Double), Double)) = {
val ((communityId, sigmaTotal), communityEdgeWeight) = item
if (louvainData.community == communityId)
totalWeight + communityEdgeWeight
else
totalWeight
}
accumulatedInternalWeight = communityMap.foldLeft(accumulatedInternalWeight)(accumulateTotalWeight)
val M = totalGraphWeight.value
val k_i = louvainData.nodeWeight + louvainData.internalWeight
val q = (accumulatedInternalWeight.toDouble / M) - ((sigmaTot * k_i) / math.pow(M, 2))
//println(s"vid: $vertexId community: $community $q = ($accumulatedInternalWeight / $M) - (($sigmaTot * $k_i) / math.pow($M, 2))")
if (q < 0)
0
else
q
})
val actualQ = newVertices.values.reduce(_ + _)
// return the modularity value of the graph along with the graph itself,
// whose vertices are labeled with their community
(actualQ, louvainGraph, count / 2)
}
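/**
 * Collapse each community into a single vertex whose id is the community id:
 * intra-community edge weight (counted from both endpoints) plus the members'
 * internal weights become the new vertex's internalWeight, and inter-community
 * edges are merged into single weighted community-to-community edges.
 */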
def compressGraph(graph: Graph[LouvainData2, Double], debug: Boolean = true): Graph[LouvainData2, Double] = {
// aggregate the edge weights of self loops, i.e. edges with both src and dst in the same community.
// WARNING: cannot use graph.mapReduceTriplets because we are mapping to new vertex ids
val internalEdgeWeights = graph.triplets.flatMap(et => {
if (et.srcAttr.community == et.dstAttr.community) {
Iterator((et.srcAttr.community, 2 * et.attr)) // count the weight from both nodes
}
else Iterator.empty
}).reduceByKey(_ + _)
// aggregate the internal weights of all nodes in each community
val internalWeights = graph.vertices.values.map(vdata =>
(vdata.community, vdata.internalWeight))
.reduceByKey(_ + _)
// join internal weights and self edges to find the new internal weight of each community
val newVertices = internalWeights.leftOuterJoin(internalEdgeWeights).map({ case (communityId, (internalWeight, selfLoopWeightOption)) =>
val selfLoopWeight = selfLoopWeightOption.getOrElse(0.0)
val state = new LouvainData2()
state.community = communityId
state.changed = false
state.communitySigmaTot = 0.0
state.internalWeight = internalWeight + selfLoopWeight
state.nodeWeight = 0.0
(communityId, state)
}).cache()
// translate each vertex edge to a community edge
val edges = graph.triplets.flatMap(et => {
val src = math.min(et.srcAttr.community, et.dstAttr.community)
val dst = math.max(et.srcAttr.community, et.dstAttr.community)
if (src != dst) Iterator(new Edge(src, dst, et.attr))
else Iterator.empty
}).cache()
// generate a new graph where each community of the previous graph is
// now represented as a single vertex
val compressedGraph = Graph(newVertices, edges)
.partitionBy(PartitionStrategy.EdgePartition2D).groupEdges(_ + _)
// calculate the weighted degree of each node
val nodeWeights = compressedGraph.aggregateMessages(
(e:EdgeContext[LouvainData2, Double, Double]) => {
e.sendToSrc(e.attr)
e.sendToDst(e.attr)
},
(e1: Double, e2: Double) => e1 + e2
)
// fill in the weighted degree of each node
// val louvainGraph = compressedGraph.joinVertices(nodeWeights)((vid,data,weight)=> {
val louvainGraph = compressedGraph.outerJoinVertices(nodeWeights)((vid, data, weightOption) => {
val weight = weightOption.getOrElse(0.0)
data.communitySigmaTot = weight + data.internalWeight
data.nodeWeight = weight
data
}) //.cache()
louvainGraph.vertices.count()
louvainGraph.triplets.count() // materialize the graph
newVertices.unpersist(blocking = false)
edges.unpersist(blocking = false)
louvainGraph
}
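/**
 * End-to-end entry point: index the string node labels of `data`, build a weighted
 * graph from (node1, node2, edgeAttr) rows, run Louvain level by level while
 * modularity improves by more than 0.001 (halting early when a level completes in
 * at most two passes), and return a one-column DataFrame ("groups") holding a
 * semicolon-separated member list per detected community.
 */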
def process(spark: SparkSession, data: DataFrame, node1: String, node2: String, edgeAttr: String,
minimumCompressionProgress: Int = 2000, progressCounter: Int = 20): DataFrame = {
import spark.implicits._
// index the string node labels as consecutive numeric ids
val targetData = data.select(col(node1), col(node2), col(edgeAttr).cast(DoubleType))
val allNodes = targetData.select(col(node1)).union(targetData.select(col(node2))).distinct().toDF("node")
val indexer = new StringIndexer()
.setInputCol("node")
.setOutputCol("nodeIndex")
val nodeIndexed = indexer.fit(allNodes).transform(allNodes)
nodeIndexed.cache()
// replace both endpoint labels with their numeric indices
val changedData = targetData.join(nodeIndexed, targetData(node1).equalTo(nodeIndexed("node")), "left")
.select(col(node2), col("nodeIndex").as(node1), col(edgeAttr))
.join(nodeIndexed, targetData(node2).equalTo(nodeIndexed("node")), "left")
.select(col(node1).cast(LongType), col("nodeIndex").cast(LongType).as(node2), col(edgeAttr))
nodeIndexed.unpersist()
//create graph
val edgeRDD = changedData.rdd.map(row => Edge(row.getLong(0), row.getLong(1), row.getDouble(2)))
val initialGraph = Graph.fromEdges(edgeRDD, None)
var louvainGraph = createLouvainGraph(initialGraph)
var compressionLevel = -1 // number of times the graph has been compressed
var q_modularityValue = -1.0 // current modularity value
var halt = false
var qValues: Array[(Int, Double)] = Array()
var t: RDD[(Long, Long)] = null
do {
compressionLevel += 1
println(s"\nStarting Louvain level $compressionLevel")
// label each vertex with its best community choice at this level of compression
val (currentQModularityValue, currentGraph, numberOfPasses) =
louvain(spark.sparkContext, louvainGraph, minimumCompressionProgress, progressCounter)
louvainGraph.unpersistVertices(blocking = false)
louvainGraph = currentGraph
println(s"qValue: $currentQModularityValue")
qValues = qValues :+ ((compressionLevel, currentQModularityValue))
// t accumulates the mapping (original vertex id -> current community id); each
// compressed level's vertex ids are the previous level's community ids, so we
// relabel by joining on the previous community id.
t = if (t == null) {
louvainGraph.vertices.map { case (vid, louvainData) => (vid.toLong, louvainData.community) }
} else {
val levelMap = louvainGraph.vertices.map { case (vid, louvainData) => (vid.toLong, louvainData.community) }
t.map(_.swap).join(levelMap).map { case (_, (origVid, newCommunity)) => (origVid, newCommunity) }
}
// If modularity was increased by at least 0.001 compress the graph and repeat
// halt immediately if the community labeling took less than 3 passes
//println(s"if ($numberOfPasses > 2 && $currentQModularityValue > $q_modularityValue + 0.001)")
if (numberOfPasses > 2 && currentQModularityValue > q_modularityValue + 0.001) {
q_modularityValue = currentQModularityValue
louvainGraph = compressGraph(louvainGraph)
}
else {
halt = true
}
} while (!halt)
//finalSave(sc, compressionLevel, q_modularityValue, louvainGraph)
// t.take(100).foreach(println)
val df1 = spark.createDataset(t).toDF("x", "y")
val df2 = df1.join(nodeIndexed, df1("x").equalTo(nodeIndexed("nodeIndex")), "left")
.select(col("y"), col("node"))
df2.groupBy("y").agg(collect_list(col("node")).as("groups")).select(concat_ws(";", col("groups")).as("groups"))
}
}
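
// Minimal usage sketch: builds a toy weighted edge list and runs the full pipeline.
// The column names "src"/"dst"/"weight" and the local[*] master are illustrative
// assumptions only, not requirements of Louvain2.
object Louvain2Example {
def main(args: Array[String]): Unit = {
val spark = SparkSession.builder().master("local[*]").appName("Louvain2Example").getOrCreate()
import spark.implicits._
// two tight triangles joined by one weak edge: Louvain should find two communities
val edges = Seq(
("a", "b", 1.0), ("b", "c", 1.0), ("a", "c", 1.0),
("d", "e", 1.0), ("e", "f", 1.0), ("d", "f", 1.0),
("c", "d", 0.1)
).toDF("src", "dst", "weight")
val groups = new Louvain2().process(spark, edges, "src", "dst", "weight")
groups.show(truncate = false)
spark.stop()
}
}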