package date.iterator.count.test;

import date.iterator.count.isodata.Cluster;
import date.iterator.count.isodata.ISOData;
import date.iterator.count.isodata.Point;
import date.iterator.count.util.CalculationUtil;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.SparkSession;

import java.util.List;

public class LRPredictTest {
    private static int expectK = 15;     // expected number of cluster centers
    private static int totalLoopI = 20;  // number of iterations to run
    private static int theta_S = 3;      // θs: threshold on the standard deviation of samples within a cluster; the largest intra-cluster standard-deviation component must stay below θs, otherwise the cluster is split
    private static int theta_c = 1;      // θc: minimum distance between two cluster centers; centers closer than this are merged
    private static int pointSize = 5000; // number of synthetic test points to generate
    private static int initK = 5;        // initial number of clusters
    public static void main(final String[] args) {
        // Generate synthetic test points and cluster them with ISODATA.
        List<Point> points = CalculationUtil.testPoints(pointSize);
        ISOData isoData = new ISOData(expectK, totalLoopI, theta_S, theta_c, initK, points);
        List<Cluster> clusters = isoData.calculate();
        System.out.println(clusters.size());
    }
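
    // A sketch, not part of the original flow: running the same ISODATA
    // clustering on Hive data instead of the synthetic points above. The
    // classId argument and the toPoints() helper (defined after queryTable())
    // are assumptions introduced here for illustration.
    private static void runOnHiveData(final String classId) {
        SparkSession sparkSession = getData();
        Dataset<Row> data = queryTable(sparkSession, classId);
        List<Point> points = toPoints(data);
        ISOData isoData = new ISOData(expectK, totalLoopI, theta_S, theta_c, initK, points);
        System.out.println(isoData.calculate().size());
    }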
    // Builds a local-mode SparkSession wired to a remote Hive metastore and
    // returns it so queryTable() can run SQL against Hive.
    private static SparkSession getData() {
        SparkSession sparkSession = SparkSession.builder()
                .appName("aaa")
                .master("local[*]")
                .config("spark.local.dir", "D:\\tmp")
                .config("hive.metastore.uris", "thrift://10.0.2.130:9083")
                .config("spark.executor.instances", "35") // "num-executors" is a spark-submit flag, not a conf key
                .config("spark.dynamicAllocation.enabled", "true")
                .config("spark.dynamicAllocation.maxExecutors", "35")
                .config("spark.shuffle.service.enabled", "true")
                .config("spark.sql.parquet.writeLegacyFormat", "true")
                .config("spark.sql.crossJoin.enabled", "true")
                .config("spark.yarn.executor.memoryOverhead", "4g")
                .config("spark.executor.memory", "4g")
                //.config("spark.default.parallelism", "120")
                .enableHiveSupport()
                .getOrCreate();
        return sparkSession;
    }
    // Selects all rows with at least one order from the per-class Hive table.
    private static Dataset<Row> queryTable(final SparkSession sparkSession, final String classId) {
        String table = "data_set.collect_" + classId + "_set";
        String sql = "SELECT * FROM " + table + " WHERE order_count > 0";
        return sparkSession.sql(sql);
    }
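
    // Hypothetical glue between the Spark query above and the ISOData input in
    // main(). A minimal sketch, assuming the table exposes numeric feature
    // columns named "x" and "y" and that Point has a two-argument (x, y)
    // constructor; neither is confirmed by this class, so adapt both to the
    // real schema and Point API.
    private static List<Point> toPoints(final Dataset<Row> data) {
        List<Point> points = new java.util.ArrayList<>();
        for (Row row : data.collectAsList()) { // collectAsList() is only safe for small result sets
            double x = ((Number) row.getAs("x")).doubleValue(); // assumed column name
            double y = ((Number) row.getAs("y")).doubleValue(); // assumed column name
            points.add(new Point(x, y)); // assumed constructor
        }
        return points;
    }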
}