com.datastax.data.prepare.spark.rdd.RDDTransformation Maven / Gradle / Ivy
The newest version!
package com.datastax.data.prepare.spark.rdd;
import com.datastax.insight.spec.RDDOperator;
import com.datastax.insight.annonation.InsightComponent;
import com.datastax.insight.annonation.InsightComponentArg;
import com.datastax.data.prepare.util.Consts;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.function.Function;
import org.apache.spark.mllib.linalg.Vector;
import org.apache.spark.mllib.linalg.Vectors;
import org.apache.spark.mllib.regression.LabeledPoint;
import java.util.ArrayList;
import java.util.List;
public class RDDTransformation implements RDDOperator {
/**
 * Converts an RDD of delimited text lines into an RDD of dense MLlib vectors.
 *
 * <p>Each line is split on the delimiter and every resulting token is parsed with
 * {@link Double#parseDouble(String)}; the parsed values become the vector entries in order.
 *
 * <p>Note that the delimiter is handed to {@link String#split(String)} and is therefore
 * interpreted as a regular expression.
 *
 * @param rdd       lines to convert, one vector per line
 * @param delimiter token separator; {@code null} or empty falls back to {@code Consts.DELIMITER}
 * @return an RDD holding one dense vector per input line
 * @throws NumberFormatException raised while a record is processed if a token is not a valid double
 */
@InsightComponent(name = "RDD 向量化", description = "向量化", icon = "arrows", order = 500206)
public static JavaRDD<Vector> denseVector(
        @InsightComponentArg(externalInput = true, name = "rdd", description = "操作数据") JavaRDD<String> rdd,
        @InsightComponentArg(name = "delimiter", description = "分隔符", defaultValue = ";") String delimiter) {
    // The effective delimiter does not depend on the record, so resolve it once
    // instead of re-evaluating the fallback for every line.
    final String delim = (delimiter == null || delimiter.isEmpty()) ? Consts.DELIMITER : delimiter;

    return rdd.map((Function<String, Vector>) line -> {
        String[] tokens = line.split(delim);
        double[] values = new double[tokens.length];
        for (int i = 0; i < tokens.length; i++) {
            values[i] = Double.parseDouble(tokens[i]);
        }
        return Vectors.dense(values);
    });
}
/**
 * Converts an RDD of delimited text lines into an RDD of MLlib {@link LabeledPoint}s.
 *
 * <p>The first token of each line is the label; all remaining tokens are the features, in
 * order. Every token is parsed with {@link Double#parseDouble(String)}.
 *
 * <p>The feature vector contains exactly the tokens after the label. (An earlier version
 * allocated one slot per token and filled it from index 1, which left a spurious leading
 * {@code 0.0} in every feature vector.)
 *
 * <p>Note that the delimiter is handed to {@link String#split(String)} and is therefore
 * interpreted as a regular expression.
 *
 * @param rdd       lines to convert, one labeled point per line
 * @param delimiter token separator; {@code null} or empty falls back to {@code Consts.DELIMITER}
 * @return an RDD holding one labeled point per input line
 * @throws NumberFormatException raised while a record is processed if a token is not a valid double
 */
@InsightComponent(name = "RDD LabeledPoint化", description = "LabeledPoint化", icon = "arrows", order = 500207)
public static JavaRDD<LabeledPoint> lpRDD(
        @InsightComponentArg(externalInput = true, name = "rdd", description = "操作数据") JavaRDD<String> rdd,
        @InsightComponentArg(name = "delimiter", description = "分隔符", defaultValue = ";") String delimiter) {
    // The effective delimiter does not depend on the record, so resolve it once
    // instead of re-evaluating the fallback for every line.
    final String delim = (delimiter == null || delimiter.isEmpty()) ? Consts.DELIMITER : delimiter;

    return rdd.map((Function<String, LabeledPoint>) line -> {
        String[] tokens = line.split(delim);
        // Read the label first so a line that yields no tokens fails on the label access.
        double label = Double.parseDouble(tokens[0]);

        // Features are tokens[1..n-1]; shift by one so they start at index 0 of the vector.
        double[] features = new double[tokens.length - 1];
        for (int i = 1; i < tokens.length; i++) {
            features[i - 1] = Double.parseDouble(tokens[i]);
        }
        return new LabeledPoint(label, Vectors.dense(features));
    });
}
/**
 * "RDD Split" component. The visible part cuts {@code weights} on {@code Consts.DELIMITER}
 * and allocates one {@code double} slot per piece; the annotation text says the weights are
 * semicolon-separated ratios.
 *
 * NOTE(review): the remainder of this method is not visible in this copy of the file (see
 * the note further down), so how {@code ws} is filled and used cannot be confirmed here.
 */
@InsightComponent(name = "RDD Split", description = "Split", icon = "square-o", order = 500502)
public static JavaRDD[] split(
@InsightComponentArg(externalInput = true,name = "rdd", description = "需要处理的数据RDD",defaultValue = "${output}._1") JavaRDD data,
@InsightComponentArg(name = "weights", description = "风格比例,用分号隔开") String weights){
String[] texts=weights.split(Consts.DELIMITER);
double[] ws=new double[texts.length];
// NOTE(review): the next line is damaged. The loop header stops after `i` and runs straight
// into the signature of `distinct`, so the rest of `split` (loop condition, loop body, return)
// and the annotation/modifiers of `distinct` are missing. Restore them from the original
// source; this will not compile as it stands.
for(int i=0;i JavaRDD distinct(
@InsightComponentArg(externalInput = true,name = "rdd", description = "操作RDD") JavaRDD rdd,
@InsightComponentArg(name = "numPartitions", description = "numPartitions",defaultValue = "0") int numPartitions){
// A positive numPartitions is forwarded to Spark; zero or negative means "not specified"
// and the no-argument distinct() is used instead.
if(numPartitions>0){
return rdd.distinct(numPartitions);
}else {
return rdd.distinct();
}
}
/**
 * "RDD Coalesce" component: forwards to {@code JavaRDD.coalesce(numPartitions, shuffle)}.
 *
 * @param rdd           RDD to operate on
 * @param numPartitions partition count passed to Spark unchanged
 * @param shuffle       shuffle flag passed to Spark unchanged
 * @return the RDD returned by Spark's {@code coalesce}
 */
@InsightComponent(name = "RDD Coalesce", description = "Coalesce", icon = "arrows", order = 50010507)
public static JavaRDD coalesce(
        @InsightComponentArg(externalInput = true, name = "rdd", description = "操作RDD") JavaRDD rdd,
        @InsightComponentArg(name = "numPartitions", description = "numPartitions") int numPartitions,
        @InsightComponentArg(name = "shuffle", description = "shuffle") boolean shuffle) {
    final JavaRDD coalesced = rdd.coalesce(numPartitions, shuffle);
    return coalesced;
}
/**
 * "RDD Repartition" component: forwards to {@code JavaRDD.repartition(numPartitions)}.
 *
 * @param rdd           RDD to operate on
 * @param numPartitions partition count passed to Spark unchanged
 * @return the RDD returned by Spark's {@code repartition}
 */
@InsightComponent(name = "RDD Repartition", description = "Repartition", icon = "arrows", order = 50010508)
public static JavaRDD repartition(
        @InsightComponentArg(externalInput = true, name = "rdd", description = "操作RDD") JavaRDD rdd,
        @InsightComponentArg(name = "numPartitions", description = "numPartitions") int numPartitions) {
    final JavaRDD repartitioned = rdd.repartition(numPartitions);
    return repartitioned;
}
/**
 * "RDD Sample" component: forwards to {@code JavaRDD.sample(withReplacement, fraction, seed)}.
 *
 * @param rdd             RDD to operate on
 * @param withReplacement replacement flag passed to Spark unchanged
 * @param fraction        sampling fraction passed to Spark unchanged
 * @param seed            random seed passed to Spark unchanged
 * @return the RDD returned by Spark's {@code sample}
 */
@InsightComponent(name = "RDD Sample", description = "Sample", icon = "arrows", order = 500101)
public static JavaRDD sample(
        @InsightComponentArg(externalInput = true, name = "rdd", description = "操作RDD") JavaRDD rdd,
        @InsightComponentArg(name = "withReplacement", description = "withReplacement") boolean withReplacement,
        @InsightComponentArg(name = "fraction", description = "fraction") double fraction,
        @InsightComponentArg(name = "seed", description = "seed") long seed) {
    final JavaRDD sampled = rdd.sample(withReplacement, fraction, seed);
    return sampled;
}
/**
 * "RDD Union" component: forwards to {@code JavaRDD.union(other)}.
 *
 * @param rdd   left-hand RDD, the receiver of the call
 * @param other right-hand RDD, passed as the argument
 * @return the RDD returned by Spark's {@code union}
 */
@InsightComponent(name = "RDD Union", description = "Union", icon = "arrows", order = 50010506)
public static JavaRDD union(
        @InsightComponentArg(externalInput = true, name = "left", description = "操作RDD") JavaRDD rdd,
        @InsightComponentArg(name = "right", description = "操作RDD") JavaRDD other) {
    final JavaRDD combined = rdd.union(other);
    return combined;
}
/**
 * "RDD Intersection" component: forwards to {@code JavaRDD.intersection(other)}.
 *
 * @param rdd   RDD the call is made on
 * @param other RDD passed as the argument
 * @return the RDD returned by Spark's {@code intersection}
 */
@InsightComponent(name = "RDD Intersection", description = "Intersection", icon = "arrows", order = 50010509)
public static JavaRDD intersection(
        @InsightComponentArg(externalInput = true, name = "rdd", description = "操作RDD") JavaRDD rdd,
        @InsightComponentArg(name = "other", description = "操作RDD") JavaRDD other) {
    final JavaRDD common = rdd.intersection(other);
    return common;
}
/**
 * "RDD Subtract" component: forwards to {@code JavaRDD.subtract}.
 *
 * <p>Follows the same convention as {@code distinct} in this class: a positive
 * {@code numPartitions} is forwarded to Spark, while zero or a negative value is treated as
 * "not specified" and the overload without a partition count is used, so a non-positive
 * count is never handed to Spark.
 *
 * @param rdd           RDD the call is made on
 * @param other         RDD passed as the argument
 * @param numPartitions partition count for the result; {@code <= 0} selects Spark's default
 * @return the RDD returned by Spark's {@code subtract}
 */
@InsightComponent(name = "RDD Subtract", description = "Subtract", icon = "arrows", order = 50010510)
public static JavaRDD subtract(
        @InsightComponentArg(externalInput = true, name = "rdd", description = "操作RDD") JavaRDD rdd,
        @InsightComponentArg(name = "other", description = "操作RDD") JavaRDD other,
        @InsightComponentArg(name = "numPartitions", description = "numPartitions") int numPartitions) {
    if (numPartitions > 0) {
        return rdd.subtract(other, numPartitions);
    }
    // No usable partition count supplied: let Spark choose the partitioning.
    return rdd.subtract(other);
}
/**
 * "RDD set name" component: forwards to {@code JavaRDD.setName(name)}.
 *
 * @param rdd  RDD to operate on
 * @param name name passed to Spark unchanged
 * @return the value returned by {@code rdd.setName(name)}
 */
@InsightComponent(name = "RDD 设置名称", description = "设置名称", icon = "arrows", order = 50010805)
public static JavaRDD setName(
        @InsightComponentArg(externalInput = true, name = "rdd", description = "操作RDD") JavaRDD rdd,
        @InsightComponentArg(name = "name", description = "名称") String name) {
    final JavaRDD named = rdd.setName(name);
    return named;
}
@InsightComponent(name = "RDD Glom", description = "Glom", icon = "arrows", order = 500208)
public static JavaRDD> glom(
@InsightComponentArg(externalInput = true,name = "rdd", description = "操作RDD") JavaRDD rdd){
return rdd.glom();
}
/**
 * "RDD Pipe" component: copies the varargs command into a list and forwards it to
 * {@code JavaRDD.pipe(List)}.
 *
 * @param rdd     RDD to operate on
 * @param command command tokens, kept in the order given
 * @return the RDD returned by Spark's {@code pipe}
 */
@InsightComponent(name = "RDD Pipe", description = "Pipe", icon = "arrows", order = 500210)
public static JavaRDD pipe(
        @InsightComponentArg(externalInput = true, name = "rdd", description = "操作RDD") JavaRDD rdd,
        @InsightComponentArg(name = "commands", description = "commands") String... command) {
    final List<String> commandLine = new ArrayList<>();
    for (int i = 0; i < command.length; i++) {
        commandLine.add(command[i]);
    }
    return rdd.pipe(commandLine);
}
/**
 * "RDD Zip" component: pairs each element with a number chosen by {@code type}.
 *
 * <ul>
 *   <li>{@code "uniqueid"} forwards to {@code JavaRDD.zipWithUniqueId()}</li>
 *   <li>{@code "index"} forwards to {@code JavaRDD.zipWithIndex()}</li>
 * </ul>
 *
 * @param rdd  RDD to operate on
 * @param type which zip variant to use; must not be {@code null}
 * @return the pair RDD returned by Spark, or {@code null} when {@code type} is neither of
 *         the two recognised values
 */
@InsightComponent(name = "RDD Zip", description = "Zip", icon = "arrows", order = 500209)
public static JavaPairRDD zip(
        @InsightComponentArg(externalInput = true, name = "rdd", description = "操作RDD") JavaRDD rdd,
        @InsightComponentArg(name = "type", description = "类型", items = "uniqueid;index") String type) {
    switch (type) {
        case "uniqueid":
            return rdd.zipWithUniqueId();
        case "index":
            return rdd.zipWithIndex();
        default:
            // Unrecognised variant: callers receive null.
            return null;
    }
}
/**
 * "RDD Collect" component: forwards to {@code JavaRDD.collect()}.
 *
 * @param rdd RDD to operate on
 * @return the list returned by Spark's {@code collect}
 */
@InsightComponent(name = "RDD Collect", description = "Collect", icon = "arrows", order = 500211)
public static List collect(
        @InsightComponentArg(externalInput = true, name = "rdd", description = "操作数据集") JavaRDD rdd) {
    final List collected = rdd.collect();
    return collected;
}
/**
 * "RDD CollectPartitions" component. The visible part cuts {@code pid} on
 * {@code Consts.DELIMITER} and allocates one {@code int} slot per piece.
 *
 * NOTE(review): the remainder of this method is not visible in this copy of the file (see
 * the note further down), so how {@code partitionIds} is filled and used cannot be
 * confirmed here.
 */
@InsightComponent(name = "RDD CollectPartitions", description = "CollectPartitions", icon = "arrows", order = 500212)
public static List[] collectPartitions(
@InsightComponentArg(externalInput = true,name = "rdd", description = "操作RDD") JavaRDD rdd,
@InsightComponentArg(name = "partitionIds", description = "partitionIds") String pid){
String[] pids=pid.split(Consts.DELIMITER);
int[] partitionIds=new int[pids.length];
// NOTE(review): the next line is damaged. The loop header stops after `i` and runs straight
// into the signature of `count`, so the rest of `collectPartitions` (loop condition, loop
// body, return) and the annotation/modifiers of `count` are missing. Restore them from the
// original source; this will not compile as it stands.
for(int i=0;i Object count(
@InsightComponentArg(externalInput = true, name = "rdd", description = "操作RDD") JavaRDD rdd,
@InsightComponentArg(externalInput = true, name = "byValue", description = "byValue") boolean byValue,
@InsightComponentArg(name = "timeout", description = "近似计算的timeout",defaultValue = "0") long timeout,
@InsightComponentArg(name = "confidence", description = "confidence") double confidence){
// Two independent switches select one of four Spark calls:
//   byValue  -> countByValue* variants, otherwise count* variants
//   timeout>0 -> the *Approx variant (given timeout and confidence), otherwise the exact one.
// The result type differs per branch, which is why the method is declared to return Object.
if(byValue){
if(timeout>0){
return rdd.countByValueApprox(timeout,confidence);
}else {
return rdd.countByValue();
}
}else {
if(timeout>0){
return rdd.countApprox(timeout,confidence);
}else {
return rdd.count();
}
}
}
/**
 * "RDD Take" component: forwards to {@code JavaRDD.take(num)}.
 *
 * @param rdd RDD to operate on
 * @param num element count passed to Spark unchanged
 * @return the list returned by Spark's {@code take}
 */
@InsightComponent(name = "RDD Take", description = "Take", icon = "arrows", order = 50010105)
public static List take(
        @InsightComponentArg(externalInput = true, name = "rdd", description = "操作RDD") JavaRDD rdd,
        @InsightComponentArg(name = "num", description = "num") int num) {
    final List taken = rdd.take(num);
    return taken;
}
/**
 * "RDD TakeOrdered" component: forwards to {@code JavaRDD.takeOrdered(num)}.
 *
 * @param rdd RDD to operate on
 * @param num element count passed to Spark unchanged
 * @return the list returned by Spark's {@code takeOrdered}
 */
@InsightComponent(name = "RDD TakeOrdered", description = "TakeOrdered", icon = "arrows", order = 50010106)
public static List takeOrdered(
        @InsightComponentArg(externalInput = true, name = "rdd", description = "操作RDD") JavaRDD rdd,
        @InsightComponentArg(name = "num", description = "num") int num) {
    final List ordered = rdd.takeOrdered(num);
    return ordered;
}
/**
 * "RDD Top" component: forwards to {@code JavaRDD.top(num)}.
 *
 * @param rdd RDD to operate on
 * @param num element count passed to Spark unchanged
 * @return the list returned by Spark's {@code top}
 */
@InsightComponent(name = "RDD Top", description = "Top", icon = "arrows", order = 50010107)
public static List top(
        @InsightComponentArg(externalInput = true, name = "rdd", description = "操作RDD") JavaRDD rdd,
        @InsightComponentArg(name = "num", description = "num") int num) {
    final List largest = rdd.top(num);
    return largest;
}
/**
 * "RDD TakeSample" component: forwards to
 * {@code JavaRDD.takeSample(withReplacement, num, seed)}.
 *
 * @param rdd             RDD to operate on
 * @param withReplacement replacement flag passed to Spark unchanged
 * @param num             sample size passed to Spark unchanged
 * @param seed            random seed passed to Spark unchanged
 * @return the list returned by Spark's {@code takeSample}
 */
@InsightComponent(name = "RDD TakeSample", description = "TakeSample", icon = "arrows", order = 500102)
public static List takeSample(
        @InsightComponentArg(externalInput = true, name = "rdd", description = "操作RDD") JavaRDD rdd,
        @InsightComponentArg(name = "withReplacement", description = "withReplacement") boolean withReplacement,
        @InsightComponentArg(name = "num", description = "num") int num,
        @InsightComponentArg(name = "seed", description = "seed") long seed) {
    final List sampled = rdd.takeSample(withReplacement, num, seed);
    return sampled;
}
/**
 * "RDD First" component: forwards to {@code JavaRDD.first()}.
 *
 * <p>The type parameter {@code T} is declared on the method so that the return type
 * resolves; it is bound to the element type of the input RDD.
 *
 * @param rdd RDD to operate on
 * @param <T> element type of the input RDD
 * @return the element returned by Spark's {@code first}
 */
@InsightComponent(name = "RDD First", description = "First", icon = "arrows", order = 50010108)
public static <T> T first(
        @InsightComponentArg(externalInput = true, name = "rdd", description = "操作RDD") JavaRDD<T> rdd) {
    return rdd.first();
}
}
© 2015 - 2025 Weber Informatics LLC | Privacy Policy