// com.datastax.data.prepare.spark.dataset.hierarchicalCluster.Pink (Maven / Gradle / Ivy)
package com.datastax.data.prepare.spark.dataset.hierarchicalCluster;
import com.datastax.insight.core.driver.SparkContextBuilder;
import com.datastax.data.prepare.spark.dataset.GraphXBFS;
import com.datastax.data.prepare.spark.dataset.hierarchicalCluster.algorithm.PinkMSTEdge;
import com.datastax.data.prepare.spark.dataset.hierarchicalCluster.algorithm.UnionFind;
import com.datastax.data.prepare.spark.dataset.hierarchicalCluster.entry.Edge;
import com.datastax.data.prepare.spark.dataset.hierarchicalCluster.writable.EdgeWritable;
import com.google.common.collect.Lists;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.SequenceFile;
import org.apache.ivy.util.StringUtils;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.Function;
import org.apache.spark.api.java.function.Function2;
import org.apache.spark.api.java.function.PairFlatMapFunction;
import org.apache.spark.api.java.function.PairFunction;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.RowFactory;
import org.apache.spark.sql.types.DataTypes;
import org.apache.spark.sql.types.StructField;
import org.apache.spark.sql.types.StructType;
import scala.Tuple2;
import java.util.*;
public class Pink {
public static Dataset runCLuster(Dataset dataset,int numDataSplits,int K,double threshold,String tmpFileDir) {
String idPartitionFilesLoc = null;
String dataParitionFilesLoc = null;
String HOME = tmpFileDir.trim();
if ("/".equals(HOME.substring(HOME.length()-1))) {
idPartitionFilesLoc = HOME + "subgraphIds";
dataParitionFilesLoc = HOME + "dataPartitions";
} else {
idPartitionFilesLoc = HOME + "/subgraphIds";
dataParitionFilesLoc = HOME + "/dataPartitions";
}
DataSplitter splitter = new DataSplitter(dataset);
int numPonits = splitter.getNumPonits();
System.out.println("numPonits is: " + numPonits);
Map idUserMap = splitter.getIdUserMap();
JavaSparkContext sc = splitter.getSparkContext();
hdfsFileDeal(dataParitionFilesLoc,idPartitionFilesLoc);
splitter.saveHadoopFileEdge(dataset,numDataSplits,dataParitionFilesLoc);
splitter.createPartitionFiles(idPartitionFilesLoc,numDataSplits);
JavaRDD partitionRDD = sc.textFile(idPartitionFilesLoc, numDataSplits);
//读取
JavaPairRDD partitions = partitionRDD.flatMapToPair(new Pink.GetPartitionFunction(dataParitionFilesLoc));
JavaPairRDD> mstToBeMerged = partitions.combineByKey(new Pink.CreateCombiner(),
new Pink.Merger(), new Pink.KruskalReducer(numPonits));
JavaPairRDD> mstToBeMergedResult = null;
while (numDataSplits > 1) {
numDataSplits = (numDataSplits + (K - 1)) / K;
mstToBeMergedResult = mstToBeMerged.mapToPair(new Pink.SetPartitionIdFunction(K))
.reduceByKey(new Pink.KruskalReducer(numPonits), numDataSplits);
mstToBeMerged = mstToBeMergedResult;
System.out.println("time: " + new Date().toString() + " numDataSplits: " + numDataSplits);
}
System.out.println(new Date().toString() + " Start cluster data!");
List result = filterThreshold(sc,mstToBeMerged.collect().get(0)._2,threshold);
System.out.println(new Date().toString() + " Stop cluster data!");
hdfsFileDeal(dataParitionFilesLoc,idPartitionFilesLoc);
return createDataset(result,idUserMap);
}
private static Dataset createDataset(List result,Map idUserMap) {
List> listResult = new ArrayList<>();
for(String claStr:result) {
List clas = new ArrayList<>();
for(String user: claStr.split(";")) {
clas.add(idUserMap.get(Integer.parseInt(user)).toString());
}
listResult.add(clas);
}
listResult.sort(Comparator.comparing(List::size).reversed());
List rows = new ArrayList<>();
int id = 1;
for (List list:listResult) {
rows.add(RowFactory.create("clu-" + id, StringUtils.join(list.toArray(),";")));
id++;
}
List structFields = new ArrayList<>();
structFields.add(DataTypes.createStructField( "cluster-id", DataTypes.StringType, true ));
structFields.add(DataTypes.createStructField( "cluster", DataTypes.StringType, true ));
StructType structType = DataTypes.createStructType( structFields );
return (Dataset)SparkContextBuilder.getSession().createDataFrame(rows, structType);
}
private static List filterThreshold (JavaSparkContext sc, Iterable iterable, double threshold) {
List listRemain = new ArrayList<>();
for (Edge edge : iterable) {
if (edge.getWeight() < -threshold ) {
listRemain.add(edge.getLeft() + "," + edge.getRight());
}
}
return GraphXBFS.devideCluster(sc,listRemain);
}
private static class KruskalReducer implements Function2, Iterable, Iterable> {
private static final long serialVersionUID = 1L;
private transient UnionFind uf = null;
private final Integer numPoints;
public KruskalReducer(int numPoints) {
this.numPoints = numPoints;
}
//排序
@Override
public Iterable call(Iterable leftEdges, Iterable rightEdges) throws Exception {
uf = new UnionFind(numPoints);
List edges = Lists.newArrayList();
Iterator leftEdgesIterator = leftEdges.iterator();
Iterator rightEdgesIterator = rightEdges.iterator();
Edge leftEdge = leftEdgesIterator.next();
Edge rightEdge = rightEdgesIterator.next();
Edge minEdge;
boolean isLeft;
Iterator minEdgeIterator;
final int numEdges = numPoints - 1;
do {
if (leftEdge.getWeight() < rightEdge.getWeight()) {
minEdgeIterator = leftEdgesIterator;
minEdge = leftEdge;
isLeft = true;
} else {
minEdgeIterator = rightEdgesIterator;
minEdge = rightEdge;
isLeft = false;
}
if (uf.unify(minEdge.getLeft(), minEdge.getRight())) {
edges.add(minEdge);
}
minEdge = minEdgeIterator.hasNext() ? minEdgeIterator.next() : null;
if (isLeft) {
leftEdge = minEdge;
} else {
rightEdge = minEdge;
}
} while (minEdge != null && edges.size() < numEdges);
minEdge = isLeft ? rightEdge : leftEdge;
minEdgeIterator = isLeft ? rightEdgesIterator : leftEdgesIterator;
while (edges.size() < numEdges && minEdgeIterator.hasNext()) {
if (uf.unify(minEdge.getLeft(), minEdge.getRight())) {
edges.add(minEdge);
}
minEdge = minEdgeIterator.next();
}
return edges;
}
}
private static class CreateCombiner implements Function> {
private static final long serialVersionUID = 1L;
@Override
public Iterable call(Edge edge) throws Exception {
List edgeList = Lists.newArrayListWithCapacity(1);
edgeList.add(edge);
return edgeList;
}
}
private static class Merger implements Function2, Edge, Iterable> {
private static final long serialVersionUID = 1L;
@Override
public Iterable call(Iterable edges, Edge edge) throws Exception {
List mergeList = Lists.newArrayList(edges);
mergeList.add(edge);
return mergeList;
}
}
private static class SetPartitionIdFunction implements PairFunction>, Integer, Iterable> {
private static final long serialVersionUID = 1L;
private final Integer K;
SetPartitionIdFunction(int K) {
this.K = K;
}
@Override
public Tuple2> call(Tuple2> integerIterableTuple2) throws Exception {
Integer key = integerIterableTuple2._1 / K;
return new Tuple2>(key, integerIterableTuple2._2);
}
}
public static final class GetPartitionFunction implements PairFlatMapFunction {
private static final long serialVersionUID = 1L;
private final String inputFile;
GetPartitionFunction(String inputFile) {
this.inputFile = inputFile;
}
@Override
public Iterator> call(String row) throws Exception {
final Integer partitionId = Integer.parseInt(row);
// final String inputDataFilesLoc = "hdfs://master:8020/dataexa/insight/poc/result/dataPartitions";
List subGraphEdge = getSubGraphEdge(partitionId, inputFile);
PinkMSTEdge pinkMSTEdge = new PinkMSTEdge(subGraphEdge,partitionId);
List> mining = pinkMSTEdge.mining();
return mining.iterator();
}
}
private static List openFileEdge(String fileName) throws Exception {
final Configuration conf = new Configuration();
org.apache.hadoop.io.SequenceFile.Reader.Option fileOpt = SequenceFile.Reader.file(new Path(fileName));
SequenceFile.Reader reader = new SequenceFile.Reader(conf, fileOpt);
NullWritable key = NullWritable.get();
EdgeWritable value = new EdgeWritable();
List list = Lists.newArrayList();
while (reader.next(key, value)) {
list.add(value.clone());
}
reader.close();
return list;
}
private static List getSubGraphEdge(int partitionId, String inputDataFilesLoc) throws Exception {
String leftFileName = String.format("%s/part-%05d", inputDataFilesLoc, partitionId);
List edgesLeft = openFileEdge(leftFileName);
return edgesLeft;
}
private static void hdfsFileDeal(String... files) {
for(String file :files) {
DataSplitter.deleteHdfsFile(file);
}
}
}
// © 2015 - 2025 Weber Informatics LLC | Privacy Policy