// ml.sparkling.graph.operators.partitioning.PropagationBasedPartitioning.scala Maven / Gradle / Ivy
// The newest version!
package ml.sparkling.graph.operators.partitioning
import ml.sparkling.graph.api.operators.algorithms.community.CommunityDetection.{CommunityDetectionAlgorithm, CommunityDetectionMethod, ComponentID}
import ml.sparkling.graph.operators.partitioning.CommunityBasedPartitioning.{ByComponentIdPartitionStrategy, logger}
import ml.sparkling.graph.operators.partitioning.PSCANBasedPartitioning.logger
import org.apache.log4j.Logger
import org.apache.spark.SparkContext
import org.apache.spark.broadcast.Broadcast
import org.apache.spark.graphx.{Graph, PartitionID, PartitionStrategy, VertexId}
import scala.reflect.ClassTag
/**
* Created by Roman Bartusiak ([email protected] http://riomus.github.io).
* First approach to community-based graph partitioning. It is inefficient because the vertex-to-component-id mapping has to be gathered on the driver node.
*/
object PropagationBasedPartitioning {
// Logger for this object; used for per-iteration progress and the final partitioning summary.
val logger=Logger.getLogger(PropagationBasedPartitioning.getClass())
/**
 * Repartitions `graph` by iteratively grouping its vertices into components and then
 * mapping those components onto partitions.
 *
 * Every vertex starts with its own id as its component id. The loop then repeatedly updates
 * the component ids and recounts the distinct ids until the stop condition below is met.
 * The resulting vertex-to-component map is aggregated into a single `Map`, coarsened with
 * `PartitioningUtils.coarsePartitions`, broadcast, and used as a `ByComponentIdPartitionStrategy`.
 *
 * @param graph graph to partition; its vertex and edge attributes are not modified
 * @param numParts requested number of partitions; `-1` (the default) means `sc.defaultParallelism`
 * @param checkpointingFrequency the intermediate vertices and working graph are checkpointed
 *                               every time the iteration counter is a multiple of this value
 * @param sc Spark context, used for the default parallelism and for broadcasting the vertex map
 * @tparam VD vertex attribute type
 * @tparam ED edge attribute type
 * @return `graph` repartitioned with the component-based strategy
 */
def partitionGraphBy[VD:ClassTag,ED:ClassTag](graph:Graph[VD,ED],numParts:Int= -1,checkpointingFrequency:Int=10)(implicit sc:SparkContext): Graph[VD, ED] ={
val numberOfPartitions=if (numParts== -1) sc.defaultParallelism else numParts
// Working graph: each vertex carries its current component id, initialised to its own vertex id.
var operationGraph=graph.mapVertices{
case (vId,_)=>vId
}
var oldComponents=operationGraph.vertices;
var numberOfComponents=graph.numVertices;
// Long.MaxValue is greater than Int.MaxValue, so the second half of the loop condition
// is true on the first evaluation and at least one iteration always runs.
var oldNumberOfComponents=Long.MaxValue;
var iteration=0;
// Keep iterating while there are more components than target partitions, more than one
// component, and the component count changed in the previous iteration.
while ((numberOfComponents>numberOfPartitions && numberOfComponents!=1 && oldNumberOfComponents!=numberOfComponents) || oldNumberOfComponents>Int.MaxValue){
logger.info(s"Propagation based partitioning: iteration:$iteration, last number of components:$oldNumberOfComponents, current number of components:$numberOfComponents")
iteration=iteration+1;
// Snapshot of the component assignment before this iteration's update.
oldComponents=operationGraph.vertices.cache();
// NOTE(review): the next two lines look truncated - the text between `ctx.srcAttr` and
// `newData.getOrElse(oldData)` is missing and `newIds` is never visibly used. Restore this
// span from the upstream sources before compiling. Presumably the lost code propagates the
// smaller component id across each edge and joins the result back into `operationGraph`
// - TODO confirm.
val newIds=operationGraph.aggregateMessages[VertexId](ctx=>{
if(ctx.srcAttrnewData.getOrElse(oldData)
}.cache()
oldNumberOfComponents=numberOfComponents
// Number of distinct component ids currently assigned to vertices.
numberOfComponents=operationGraph.vertices.map(_._2).distinct().count()
if(iteration%checkpointingFrequency==0){
oldComponents.checkpoint();
operationGraph.checkpoint();
// No-op actions over each dataset; presumably run to force evaluation so that the
// checkpoints requested above are actually materialised - TODO confirm.
operationGraph.vertices.foreachPartition((_)=>{})
operationGraph.edges.foreachPartition((_)=>{})
oldComponents.foreachPartition((_)=>{})
}
}
// Uses the assignment and count captured before the last iteration's update
// (oldComponents / oldNumberOfComponents), not the final values.
val (communities,numberOfCommunities)=(oldComponents,oldNumberOfComponents)
// Folds every (vertex id -> component id) pair into one in-memory Map returned to the caller.
val vertexToCommunityId: Map[VertexId, ComponentID] = communities.treeAggregate(Map[VertexId,VertexId]())((agg,data)=>{agg+(data._1->data._2)},(agg1,agg2)=>agg1++agg2)
// coarsePartitions is defined elsewhere; presumably it merges components down towards
// numberOfPartitions and returns the vertex map plus the resulting group count - verify.
val (vertexMap,newNumberOfCummunities)=PartitioningUtils.coarsePartitions(numberOfPartitions, numberOfCommunities, vertexToCommunityId)
val strategy=ByComponentIdPartitionStrategy(sc.broadcast(vertexMap))
// NOTE(review): the message labels numberOfCommunities as "partitions", while partitionBy
// below is called with newNumberOfCummunities - confirm which value the log should report.
logger.info(s"Partitioning graph using coarsed map with ${vertexMap.size} entries (${vertexToCommunityId.size} before coarse) and ${numberOfCommunities} partitions")
graph.partitionBy(strategy,newNumberOfCummunities.toInt)
}
}