Please wait. This can take a few minutes ...
Many resources are needed to download a project. Please understand that we have to compensate our server costs. Thank you in advance.
Project price only 1 $
You can buy this project and download/modify it as often as you want.
com.datastax.data.prepare.spark.dataset.BasicOperator Maven / Gradle / Ivy
package com.datastax.data.prepare.spark.dataset;
import com.alibaba.fastjson.JSONArray;
import com.alibaba.fastjson.JSONObject;
import com.datastax.data.prepare.util.*;
import com.datastax.insight.core.driver.SparkContextBuilder;
import com.datastax.insight.spec.Operator;
import com.datastax.data.prepare.spark.dataset.params.Aggregate;
import com.google.common.base.Strings;
import org.apache.spark.rdd.RDD;
import org.apache.spark.sql.*;
import org.apache.spark.sql.expressions.Window;
import org.apache.spark.sql.expressions.WindowSpec;
import org.apache.spark.sql.types.StructField;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.datastax.insight.annonation.InsightComponent;
import com.datastax.insight.annonation.InsightComponentArg;
import static org.apache.spark.sql.functions.*;
import org.apache.spark.sql.functions.*;
public class BasicOperator implements Operator {
private static final Logger logger = LoggerFactory.getLogger(BasicOperator.class);
/**
 * Adds an ID column named {@code IDName} to the dataset.
 *
 * <p>When {@code IDType} equals {@code Consts.INCREMENT} the column is filled with
 * {@code monotonically_increasing_id()}; for any other value a UDF called {@code "uuid"}
 * is invoked on the dataset's first column (presumably registered by
 * {@code SharedUDFs.uuid} — confirm against that helper).
 *
 * @param data   input dataset; returned unchanged (possibly {@code null}) when it is
 *               {@code null} or when {@code IDName} is null/empty
 * @param IDName name of the ID column to add
 * @param IDType ID generation strategy
 * @return the dataset with the additional ID column
 */
@InsightComponent( name = "generateID", type = "com.datastax.insight.dataprprocess.generateID", description = "ID生成", order = 500801 )
public static Dataset generateID(
        @InsightComponentArg(externalInput = true, name = "data", description = "数据集") Dataset data,
        @InsightComponentArg(name = "IDName", description = "ID属性名称", request = true) String IDName,
        @InsightComponentArg(name = "IDType", description = "ID类型", request = true, items = "UUID32;Increment", defaultValue = "UUID32") String IDType) {
    boolean usable = data != null && !Strings.isNullOrEmpty(IDName);
    if (!usable) {
        logger.info("IDName is empty");
        return data;
    }

    Column idColumn;
    if (Consts.INCREMENT.equals(IDType)) {
        idColumn = functions.lit(functions.monotonically_increasing_id());
    } else {
        // The "uuid" UDF has to be set up on the session before it can be called by name.
        SharedUDFs.uuid(SparkContextBuilder.getSession().udf());
        idColumn = functions.callUDF("uuid", data.col(data.columns()[0]));
    }
    return (Dataset) data.withColumn(IDName, idColumn);
}
// @Deprecated
/**
 * Renames one or more columns.
 *
 * <p>{@code oldName} and {@code newName} are each split on {@code Consts.DELIMITER} and
 * passed through {@code SharedMethods.handleColsWithEmpty}; the i-th old name is renamed
 * to the i-th new name. The dataset is returned unchanged (with an info log) when an
 * argument is missing, when the two lists differ in length, or when any new name is empty.
 *
 * @param dataset input dataset
 * @param oldName delimiter-separated current column names
 * @param newName delimiter-separated replacement column names
 * @return the dataset with the columns renamed, or the input dataset as-is
 */
@InsightComponent( name = "重命名", type = "com.datastax.insight.dataprprocess.rename", description = "重命名", order = 500107 )
public static Dataset rename(
        @InsightComponentArg(externalInput = true,name = "dataset",description = "数据集") Dataset dataset,
        @InsightComponentArg(name = "oldName",description = "属性原来的名字",request = true) String oldName,
        @InsightComponentArg(name = "newName",description = "属性的新名字",request = true) String newName) {
    boolean missingArgument = Strings.isNullOrEmpty(oldName)
            || Strings.isNullOrEmpty(newName)
            || dataset == null;
    if (missingArgument) {
        logger.info("oldName or newname is empty!");
        return dataset;
    }

    String[] sources = SharedMethods.handleColsWithEmpty(oldName.split(Consts.DELIMITER));
    String[] targets = SharedMethods.handleColsWithEmpty(newName.split(Consts.DELIMITER));

    // After splitting, both lists must contain the same number of names.
    if (sources.length != targets.length) {
        logger.info("The number of newname is not the same as the number of oldname");
        return dataset;
    }
    for (int i = 0; i < targets.length; i++) {
        if (Strings.isNullOrEmpty(targets[i])) {
            logger.info("newName is empty!");
            return dataset;
        }
    }

    // Apply the renames pairwise, in order.
    Dataset renamed = dataset;
    int index = 0;
    while (index < targets.length) {
        renamed = (Dataset) renamed.withColumnRenamed(sources[index], targets[index]);
        index++;
    }
    return renamed;
}
/**
 * Naming component covering both initial naming and renaming of columns, driven by a
 * JSON parameter object.
 *
 * <p>Behaviour visible in this block:
 * <ul>
 *   <li>An empty {@code object} returns {@code data} unchanged; a {@code null}
 *       {@code object} is not guarded and would fail on {@code object.isEmpty()}.</li>
 *   <li>A {@code null} dataset returns {@code null}.</li>
 *   <li>{@code "selector"} chooses the mode. For {@code Consts.INITIALIZA_NAME},
 *       {@code "selectorValue"} is the initial-naming type and {@code "method"} is its
 *       value; a blank value falls back to the prefix {@code "_c"} with type
 *       {@code Consts.AUTO}.</li>
 *   <li>For {@code Consts.RENAME}, {@code "selectorValue"} is read as a JSON array.</li>
 *   <li>Any other selector (or an initial-naming type that is neither AUTO nor MANUAL)
 *       falls through to the final {@code return null}.</li>
 * </ul>
 *
 * NOTE(review): two lines in this method are truncated in this copy of the file (see the
 * inline notes); the loop bodies must be restored from the original source before this
 * method can compile.
 *
 * @param data   input dataset
 * @param object naming parameters
 * @return the renamed dataset, the input dataset, or {@code null} as described above
 */
@InsightComponent(name = "命名", description = "命名组件, 包含重命名和初始化命名")
public static Dataset name(
@InsightComponentArg(externalInput = true, name = "数据集", description = "数据集") Dataset data,
@InsightComponentArg(name = "参数", description = "命名参数") JSONObject object) {
if(object.isEmpty()) {
logger.info("命名组件的参数为空,返回原数据集");
return data;
}
if(data == null) {
logger.info("命名组件参数中的数据集为空,返回null");
return null;
}
String type = object.getString("selector");
StructField[] fields = data.schema().fields();
if(Consts.INITIALIZA_NAME.equals(type)) {
String initialType = object.getString("selectorValue");
String value = object.getString("method").trim();
// A blank value switches to automatic naming with the default "_c" prefix.
if(value.length() == 0) {
logger.info("命名组件的初始化命名的value为空, 默认设为列名自动生成, 前缀为_c");
value = "_c";
initialType = Consts.AUTO;
}
if(Consts.AUTO.equals(initialType)) {
String[] columns = new String[fields.length];
// NOTE(review): the next line is truncated — the loop condition, the code that fills
// `columns`, and the start of the return statement are missing. Presumably it generated
// one name per field from `value` and returned data.toDF(columns) — confirm.
for(int i=0; i) data.toDF(columns);
}
if(Consts.MANUAL.equals(initialType)) {
// Manual naming: the supplied names must match the dataset's column count exactly.
String[] columns = SharedMethods.handleColsWithEmpty(value.split(Consts.DELIMITER));
if(fields.length != columns.length) {
throw new CustomException("初始化命名的列名个数和数据集的列名个数不相等");
}
return (Dataset) data.toDF(columns);
}
}
if(Consts.RENAME.equals(type)) {
JSONArray array = object.getJSONArray("selectorValue");
// NOTE(review): the next line is truncated — the loop over `array` and the extraction of
// oldName/newName are missing; only the tail of a withColumnRenamed call survives.
for(int i=0; i) data.withColumnRenamed(oldName, newName);
}
return data;
}
return null;
}
/**
 * Component entry point for aggregation: packs the string arguments into an
 * {@link Aggregate} parameter object and delegates to
 * {@code aggregate(Dataset, Aggregate)}.
 *
 * @param data        input dataset
 * @param type        grouping kind (groupBy, rollup or cube)
 * @param columns     grouping column names, separated by {@code ;}
 * @param funcType    aggregate function name
 * @param funcColumns columns to aggregate, separated by {@code ;}
 * @return whatever the delegate returns
 */
@InsightComponent( name = "聚合", description = "聚合" )
public static Dataset aggregate(
        @InsightComponentArg(externalInput = true, name = "数据集", description = "数据集") Dataset data,
        @InsightComponentArg(name = "分组类型", description = "分组类型", defaultValue = "groupBy", items = "groupBy;rollup;cube") String type,
        @InsightComponentArg(name = "分组列名", description = "分组的列名,用;隔开") String columns,
        @InsightComponentArg(name = "聚合函数类型", description = "聚合函数类型", defaultValue = "sum", items = "min;max;avg;sum;count;collect_list;collect_set;distinct_count;distinct_sum;approx_count_distinct") String funcType,
        @InsightComponentArg(name = "聚合列名", description = "聚合的列名,用;隔开") String funcColumns) {
    return aggregate(data, new Aggregate(type, columns, funcType, funcColumns));
}
/**
 * Groups {@code data} (groupBy / rollup / cube) and applies the configured aggregate
 * function to the configured columns.
 *
 * <p>Defaults: a missing grouping type becomes {@code Consts.GROUPBY}, a missing function
 * type becomes {@code Consts.SUM}. When no grouping columns are configured the aggregation
 * runs over the whole dataset. A grouping type that matches none of the three known
 * constants leaves the dataset untouched.
 *
 * @param data      input dataset
 * @param aggregate aggregation parameters; its type and function type may be defaulted
 *                  in place by this method
 * @return the aggregated dataset, or {@code data} itself when no aggregate columns are
 *         configured or the grouping type is unknown
 * @throws NullPointerException if the aggregate column list resolves to zero columns
 */
protected static Dataset aggregate(Dataset data, Aggregate aggregate) {
    if (aggregate.getFuncColumns() == null || aggregate.getFuncColumns().length() == 0) {
        logger.info("聚合选中的列名数组为空,返回原数据集");
        return data;
    }
    if (aggregate.getType() == null || aggregate.getType().length() == 0) {
        aggregate.setType(Consts.GROUPBY);
    }
    if (aggregate.getFuncType() == null || aggregate.getFuncType().length() == 0) {
        aggregate.setFuncType(Consts.SUM);
    }

    // Only resolve grouping columns when some were supplied: calling split() on a null
    // column string used to throw before the "no grouping" path could be taken.
    Column[] groupColumns = Strings.isNullOrEmpty(aggregate.getColumns())
            ? new Column[0]
            : aggFunc(null, SharedMethods.handleColsWithEmpty(aggregate.getColumns().split(Consts.DELIMITER)));

    Column[] aggColumns = aggFunc(aggregate.getFuncType(),
            SharedMethods.handleColsWithEmpty(aggregate.getFuncColumns().split(Consts.DELIMITER)));
    // Validate before sizing the tail array; otherwise an empty list would surface as a
    // NegativeArraySizeException instead of this message.
    if (aggColumns.length == 0) {
        throw new NullPointerException("addColumns为空");
    }

    // agg(first, rest...) needs the head separated from the remaining expressions; an
    // empty tail is equivalent to calling agg(first) on its own.
    Column first = aggColumns[0];
    Column[] rest = new Column[aggColumns.length - 1];
    System.arraycopy(aggColumns, 1, rest, 0, rest.length);

    // An empty groupColumns array is the same as groupBy()/rollup()/cube() with no arguments.
    String groupingType = aggregate.getType();
    if (Consts.GROUPBY.equals(groupingType)) {
        return (Dataset) data.groupBy(groupColumns).agg(first, rest);
    }
    if (Consts.ROLLUP.equals(groupingType)) {
        return (Dataset) data.rollup(groupColumns).agg(first, rest);
    }
    if (Consts.CUBE.equals(groupingType)) {
        return (Dataset) data.cube(groupColumns).agg(first, rest);
    }
    return data;
}
// Builds one Column per entry of `cols`; `type` is the aggregate function name (callers in
// this file pass null for plain grouping columns).
// NOTE(review): the body of this helper is lost in this copy of the file — see below.
private static Column[] aggFunc(String type, String[] cols) {
Column[] columns = new Column[cols.length];
// NOTE(review): the next line is truncated. It fuses the start of aggFunc's loop with the
// tail of the `explode` method header; the rest of aggFunc and the annotation/modifiers of
// `explode` are missing and must be restored from the original source.
for(int i=0,j=0; i Dataset explode(
@InsightComponentArg(externalInput = true, name = "数据集", description = "数据集",defaultValue = "${output}") Dataset data,
@InsightComponentArg(name = "扩展行数据格式", description = "数据格式是数组或者以分隔符隔开的字符串",items = "数组;有分隔符的字符串") String explodeColType,
@InsightComponentArg(name = "扩展行的列名", description = "用于扩展行的列名,多列以分隔符隔开") String explodeCol,
@InsightComponentArg(name = "扩展行后的新列名",description = "扩展行后的新列名,多列以分隔符隔开") String resultCol,
@InsightComponentArg(name = "分隔符",description = "分隔符") String separator) {
// explode: turns each element of an array column (or each piece of a delimited string
// column) into its own row, stored under `resultCol`.
// A null dataset is passed straight through.
if(data == null) {
logger.info("数据集为空");
return data;
}
// The remaining arguments are mandatory; blank values are rejected.
if(explodeCol == null || explodeCol.trim().length() == 0) {
throw new NullPointerException("扩展行的列名参数为空");
}
if(resultCol == null || resultCol.trim().length() == 0) {
throw new NullPointerException("扩展行后的新列名为空");
}
if(explodeColType == null || explodeColType.trim().length() == 0) {
throw new NullPointerException("扩展行的数据格式为空");
}
// Stays null when explodeColType matches neither supported format, in which case the
// method returns null.
Dataset result = null;
if ("数组".equals(explodeColType)) {
// Array mode: the column must exist and be of array type.
StructField[] fields = data.schema().fields();
if(!checkColAndType(fields, explodeCol)) {
throw new IllegalArgumentException("扩展行的列名在数据集中不存在或者类型不匹配");
}
result = (Dataset) data.withColumn(resultCol, functions.explode(col(explodeCol)));
} else if ("有分隔符的字符串".equals(explodeColType)) {
// Delimited-string mode: split on the separator first, then explode the pieces.
if(separator == null || separator.trim().length() == 0) {
throw new NullPointerException("分隔符参数为空");
}
// presumably escapes characters that are special in the split pattern — confirm
// against SeparatorUtil
String separatorDeal = SeparatorUtil.specialCharactDeal(separator);
result = (Dataset) data.withColumn(resultCol, functions.explode(functions.split(col(explodeCol),separatorDeal)));
}
// When the result goes into a new column, the source column is dropped.
if (result != null && !explodeCol.equals(resultCol)) {
return (Dataset) result.drop(col(explodeCol));
}
return (Dataset) result;
}
/**
 * Tells whether {@code fields} contains a field named {@code col} whose data type name
 * equals {@code Consts.ARRAY}. The matching type name is printed to standard output.
 *
 * @param fields schema fields to search
 * @param col    column name to look for
 * @return {@code true} if such a field exists, {@code false} otherwise
 */
private static boolean checkColAndType(StructField[] fields, String col) {
    int position = 0;
    while (position < fields.length) {
        StructField candidate = fields[position++];
        if (!candidate.name().equals(col)) {
            continue;
        }
        if (!Consts.ARRAY.equals(candidate.dataType().typeName())) {
            continue;
        }
        System.out.println("type:" + candidate.dataType().typeName());
        return true;
    }
    return false;
}
/**
 * Converts a dataset into its underlying RDD.
 *
 * @param data input dataset
 * @return {@code data.rdd()}
 * @throws NullPointerException if {@code data} is {@code null}
 */
@InsightComponent(name = "DatasetToRDD", description = "数据集转换成RDD")
public static RDD ds2Rdd(
        @InsightComponentArg(externalInput = true, name = "数据集", description = "数据集") Dataset data) {
    if (data != null) {
        return data.rdd();
    }
    throw new NullPointerException("数据集为空");
}
/**
 * Replaces {@code targetCol} with the list of its values collected over a row-based
 * window that starts at the current row.
 *
 * <p>The window is partitioned by {@code partitionBys}, ordered by {@code orderBys} and
 * spans from the current row to {@code rows - 1} rows ahead (1 row ahead when
 * {@code rows <= 1}). {@code type} must be non-blank but is otherwise not consulted:
 * {@code collect_list} is always applied.
 *
 * @param data         input dataset
 * @param partitionBys partition column names, split on {@code Consts.DELIMITER}
 * @param orderBys     ordering column names, split on {@code Consts.DELIMITER}
 * @param type         operation type
 * @param targetCol    column to collect and overwrite
 * @param rows         number of rows in the window
 * @return the dataset with {@code targetCol} replaced by the collected list
 * @throws NullPointerException if the dataset is null or any string argument is blank
 */
@InsightComponent(name = "window", description = "按照一定参数将多行数据合并为一个窗口,并进行一定处理")
public static Dataset window(@InsightComponentArg(externalInput = true, name = "数据集", description = "数据集") Dataset data,
        @InsightComponentArg(name = "分片的列名", description = "分片的列名,以分号隔开") String partitionBys,
        @InsightComponentArg(name = "排序的列名", description = "排序的列名,以分号隔开") String orderBys,
        @InsightComponentArg(name = "操作类型", description = "用于指定对目标列的操作类型", items = "collect_list") String type,
        @InsightComponentArg(name = "目标列", description = "要处理的列名") String targetCol,
        @InsightComponentArg(name = "行数", description = "window包含的行数") int rows) {
    if (data == null) {
        throw new NullPointerException("数据集为空");
    }
    checkNull(partitionBys, "分片的列名为空");
    checkNull(orderBys, "排序的列名为空");
    checkNull(type, "操作类型为空");
    checkNull(targetCol, "目标列为空");

    // Upper frame bound, expressed as an offset from the current row.
    final int following = rows > 1 ? rows - 1 : 1;

    String[] partitionNames = partitionBys.split(Consts.DELIMITER);
    String[] orderNames = orderBys.split(Consts.DELIMITER);

    WindowSpec partitioned = Window.partitionBy(string2Column(data, partitionNames));
    WindowSpec ordered = partitioned.orderBy(string2Column(data, orderNames));
    WindowSpec frame = ordered.rowsBetween(Window.currentRow(), following);

    Column collected = functions.collect_list(targetCol).over(frame);
    return (Dataset) data.withColumn(targetCol, collected);
}
/**
 * Rejects a string argument that is {@code null} or contains only whitespace.
 *
 * @param s        value to check
 * @param errorMsg message for the thrown exception
 * @throws NullPointerException if {@code s} is null or blank after trimming
 */
private static void checkNull(String s, String errorMsg) {
    boolean blank = s == null || s.trim().isEmpty();
    if (blank) {
        throw new NullPointerException(errorMsg);
    }
}
/**
 * Resolves each name in {@code as} to the corresponding column of {@code dataset},
 * preserving order.
 *
 * @param dataset dataset that owns the columns
 * @param as      column names
 * @return one {@link Column} per name
 */
private static Column[] string2Column(Dataset dataset, String[] as) {
    Column[] resolved = new Column[as.length];
    int position = 0;
    for (String columnName : as) {
        resolved[position++] = dataset.col(columnName);
    }
    return resolved;
}
/**
 * Delegates to {@code StockOperation.multiArray2Array} for the given column; per the
 * component description this merges a multi-dimensional array column into a
 * de-duplicated one-dimensional array.
 *
 * @param data         input dataset
 * @param wrapArrayCol name of the column to merge
 * @return the dataset produced by {@code StockOperation.multiArray2Array}
 * @throws NullPointerException if the dataset is null or the column name is blank
 */
@InsightComponent(name = "多维数组合并去重", description = "对于某一列进行多维数组合并并去重,生成一维数组")
public static Dataset wrapArray2Array(
        @InsightComponentArg(externalInput = true, name = "数据集", description = "数据集") Dataset data,
        @InsightComponentArg(name = "合并列", description = "用于合并的列,列的类型必须为多维数组") String wrapArrayCol) {
    if (data != null) {
        checkNull(wrapArrayCol, "合并列参数为空");
        Dataset frame = data.toDF();
        return (Dataset) StockOperation.multiArray2Array(frame, wrapArrayCol);
    }
    throw new NullPointerException("数据集为空");
}
/**
 * Replaces an array column with a string made by joining its elements.
 *
 * @param data      input dataset
 * @param arrayCol  name of the array column to convert
 * @param separater joiner placed between elements; {@code ";"} when {@code null}
 * @return the dataset with {@code arrayCol} replaced by the joined string
 * @throws NullPointerException     if the dataset is null or the column name is blank
 * @throws IllegalArgumentException if the column does not exist or is not of type array
 */
@InsightComponent(name = "数组转字符串", description = "将数组转换成字符串")
public static Dataset array2String(
        @InsightComponentArg(externalInput = true, name = "数据集", description = "数据集") Dataset data,
        @InsightComponentArg(name = "列名", description = "用于转换成String的列名") String arrayCol,
        @InsightComponentArg(name = "连接符", description = "连接符") String separater) {
    if (data == null) {
        throw new NullPointerException("数据集为空");
    }
    if (arrayCol == null || arrayCol.trim().length() == 0) {
        throw new NullPointerException("arrayCol为空");
    }
    final String joiner = separater != null ? separater : ";";

    // Stop scanning the schema as soon as a matching array-typed field is found.
    StructField[] schemaFields = data.schema().fields();
    boolean found = false;
    for (int i = 0; i < schemaFields.length && !found; i++) {
        StructField current = schemaFields[i];
        found = current.name().equals(arrayCol) && "array".equals(current.dataType().typeName());
    }
    if (!found) {
        throw new IllegalArgumentException(arrayCol + "列名不存在或者不为Array类型");
    }

    Column joined = functions.concat_ws(joiner, col(arrayCol));
    return (Dataset) data.withColumn(arrayCol, joined);
}
/**
 * Validates the arguments and delegates to {@code BasicOperation.mathCompute}, passing
 * {@code method(xCol,yCol)} as the final argument (presumably the name of the result
 * column — confirm against {@code BasicOperation}).
 *
 * @param data   input dataset
 * @param xCol   name of the column used as the left operand
 * @param method arithmetic operation (plus, minus, multiply, divide)
 * @param type   whether {@code yCol} is a constant or a column name; passed through
 *               without validation
 * @param yCol   right operand: a constant or a column name, depending on {@code type}
 * @return the dataset returned by {@code BasicOperation.mathCompute}
 * @throws IllegalArgumentException if the dataset is null or if {@code method},
 *                                  {@code xCol} or {@code yCol} is null or empty
 */
@InsightComponent(name = "数学运算", description = "对列进行数学运算")
public static Dataset mathCompute(
        @InsightComponentArg(externalInput = true, name = "数据集", description = "数据集") Dataset data,
        @InsightComponentArg(name = "列名", description = "用于计算的列名") String xCol,
        @InsightComponentArg(name = "运算方法", description = "数学运算方法", defaultValue = "plus", items = "plus;minus;multiply;divide") String method,
        @InsightComponentArg(name = "类型", description = "值对应的类型,列名或者常量", defaultValue = "constant", items = "constant;column") String type,
        @InsightComponentArg(name = "值", description = "输入值") String yCol) {
    if (data == null) {
        throw new IllegalArgumentException("数据集为空");
    }
    if (method == null || method.length() == 0) {
        throw new IllegalArgumentException("运算方法为空");
    }
    if (xCol == null || xCol.length() == 0) {
        throw new IllegalArgumentException("列名为空");
    }
    if (yCol == null || yCol.length() == 0) {
        // yCol is the "值" (value) argument; the message used to repeat the column-name
        // text, which pointed users at the wrong parameter.
        throw new IllegalArgumentException("值为空");
    }
    String resultLabel = method + "(" + xCol + "," + yCol + ")";
    return (Dataset) BasicOperation.mathCompute(data.toDF(), xCol, method, yCol, type, resultLabel);
}
/**
 * Keeps the rows of {@code data} that belong to groups whose aggregate value satisfies a
 * comparison against {@code threshold}.
 *
 * <p>Steps: aggregate {@code data} by {@code groupColumns}, filter the aggregated groups
 * through {@code BasicOperation.groupFilter}, then join the surviving groups back to
 * {@code data} (right join on the grouping columns) and select the original columns.
 * Progress and cumulative elapsed times are printed to standard output.
 *
 * @param data         input dataset
 * @param groupColumns grouping column names separated by {@code ;}
 * @param aggMethod    aggregate function name
 * @param funcColumns  column to aggregate
 * @param compare      comparison operator label, translated by
 *                     {@code Consts.MathCompare.getType}
 * @param threshold    value the aggregate is compared against
 * @return the rows of {@code data} whose group passed the filter, with the original columns
 * @throws IllegalArgumentException if {@code aggMethod}, {@code groupColumns} or
 *                                  {@code funcColumns} is blank
 * @throws NullPointerException     if {@code data} is {@code null}
 */
@InsightComponent(name = "分组过滤", description = "将某些列进行分组后聚合,再加上某些条件对聚合后的结果加以过滤")
public static Dataset groupFilter(
        @InsightComponentArg(externalInput = true, name = "数据集", description = "数据集") Dataset data,
        @InsightComponentArg(name = "分组列", description = "用于分组的列,多个列用分号隔开") String groupColumns,
        @InsightComponentArg(name = "聚合函数类型", description = "聚合函数类型", defaultValue = "count", items = "min;max;avg;sum;count;distinct_count;distinct_sum;approx_count_distinct") String aggMethod,
        @InsightComponentArg(name = "聚合列名", description = "聚合的列名") String funcColumns,
        @InsightComponentArg(name = "比较", description = "数学比较", defaultValue = "大于", items = "大于;不小于;小于;不大于;等于") String compare,
        @InsightComponentArg(name = "阈值", description = "用于过滤的阈值") double threshold) {
    if (aggMethod == null || aggMethod.trim().length() == 0) {
        throw new IllegalArgumentException("agg method 为空");
    }
    if (groupColumns == null || groupColumns.trim().length() == 0) {
        throw new IllegalArgumentException("group column 为空");
    }
    if (funcColumns == null || funcColumns.trim().length() == 0) {
        throw new IllegalArgumentException("func column 为空");
    }
    // Fail with a clear message instead of an anonymous NPE from inside the aggregation.
    if (data == null) {
        throw new NullPointerException("数据集为空");
    }

    long start = System.currentTimeMillis();
    System.out.println("aggregate begin");
    Dataset groupDs = BasicOperator.aggregate(data, Consts.GROUPBY, groupColumns, aggMethod, funcColumns);
    System.out.println("aggregate end");
    long time1 = System.currentTimeMillis();
    System.out.println("aggregate time: " + (time1 - start));
    // NOTE(review): no Spark action appears to run between this persist() and the matching
    // unpersist() below, so the caching likely has no effect — confirm before relying on it.
    groupDs.persist();

    // Join condition: equality on every grouping column.
    // NOTE(review): the aggregation splits group columns with Consts.DELIMITER while this
    // uses a literal ";" — confirm the two agree.
    String[] colStrings = groupColumns.split(";");
    Column condition = null;
    for (String groupColumn : colStrings) {
        Column sameGroup = data.col(groupColumn).equalTo(groupDs.col(groupColumn));
        condition = condition == null ? sameGroup : condition.and(sameGroup);
    }

    // Only the original columns are returned to the caller.
    String[] dataColumns = data.columns();
    Column[] result = new Column[dataColumns.length];
    for (int i = 0; i < dataColumns.length; i++) {
        result[i] = data.col(dataColumns[i]);
    }

    // Name under which the aggregate is looked up in the grouped dataset (presumably the
    // column name Spark generates for the aggregate expression — confirm).
    String agg;
    if ("distinct_count".equals(aggMethod)) {
        agg = "count(DISTINCT " + funcColumns + ")";
    } else if ("distinct_sum".equals(aggMethod)) {
        agg = "sum(DISTINCT " + funcColumns + ")";
    } else {
        agg = aggMethod + "(" + funcColumns + ")";
    }
    groupDs.unpersist();

    System.out.println("filter begin");
    Dataset filterDs = BasicOperation.groupFilter(groupDs.toDF(), agg, Consts.MathCompare.getType(compare), threshold);
    System.out.println("filter end ");
    long time2 = System.currentTimeMillis();
    System.out.println("filter time: " + (time2 - start));
    filterDs.persist();

    System.out.println("join begin");
    Dataset resultDs = data.join(filterDs, condition, "right");
    System.out.println("join end");
    long time3 = System.currentTimeMillis();
    // This measurement covers the join step; it used to be labelled "filter time".
    System.out.println("join time: " + (time3 - start));
    filterDs.unpersist();
    return (Dataset) resultDs.select(result);
}
/**
 * Logs the number of rows in the dataset through the Spark session's logger and returns
 * the dataset unchanged.
 *
 * @param data input dataset
 * @return {@code data}
 */
@InsightComponent(name = "count", description = "count")
public static Dataset countRow(
        @InsightComponentArg(externalInput = true, name = "数据集", description = "数据集") Dataset data) {
    SparkSession session = SparkContextBuilder.getSession();
    long rowCount = data.count();
    session.log().info("数据集行数为: " + rowCount);
    return data;
}
/**
 * Filters rows by comparing a date column against an optional start and/or end date.
 *
 * <p>Both the column and the bounds are rendered with {@code date_format} using
 * {@code dateFormat} and then compared ({@code >=} start, {@code <=} end). A blank bound
 * is ignored; when both bounds are blank the dataset is returned unfiltered.
 *
 * @param data       input dataset
 * @param dateColumn name of the column holding the date
 * @param startDate  inclusive lower bound, or blank for none
 * @param endDate    inclusive upper bound, or blank for none
 * @param dateFormat pattern applied to both sides before comparing, e.g. yyyy-MM-dd
 * @return the filtered dataset
 * @throws NullPointerException if the dataset is null or the column/format is blank
 */
@InsightComponent(name = "时间过滤", description = "时间过滤")
public static Dataset filterDate(
        @InsightComponentArg(externalInput = true, name = "数据集", description = "数据集") Dataset data,
        @InsightComponentArg(name = "时间列", description = "用于比较的时间列") String dateColumn,
        @InsightComponentArg(name = "起始时间", description = "起始时间") String startDate,
        @InsightComponentArg(name = "终止时间", description = "终止时间") String endDate,
        @InsightComponentArg(name = "时间格式", description = "时间格式,例如yyyy-MM-dd") String dateFormat) {
    if (data == null) {
        throw new NullPointerException("数据集为空");
    }
    if (checkNull(dateColumn)) {
        throw new NullPointerException("时间列为空");
    }
    if (checkNull(dateFormat)) {
        throw new NullPointerException("时间格式为空");
    }

    final boolean noStart = checkNull(startDate);
    final boolean noEnd = checkNull(endDate);
    if (noStart && noEnd) {
        return data;
    }

    // Build each bound independently; a missing bound stays null.
    Column lowerBound = noStart
            ? null
            : date_format(data.col(dateColumn), dateFormat).geq(date_format(lit(startDate), dateFormat));
    Column upperBound = noEnd
            ? null
            : date_format(data.col(dateColumn), dateFormat).leq(date_format(lit(endDate), dateFormat));

    Column condition;
    if (lowerBound == null) {
        condition = upperBound;
    } else if (upperBound == null) {
        condition = lowerBound;
    } else {
        condition = lowerBound.and(upperBound);
    }
    return data.filter(condition);
}
/**
 * Removes duplicate rows from the dataset.
 *
 * @param data input dataset
 * @return {@code data.distinct()}
 */
@InsightComponent(name = "去重", description = "去重")
public static Dataset distinct(
        @InsightComponentArg(externalInput = true, name = "数据集", description = "数据集") Dataset data) {
    Dataset deduplicated = data.distinct();
    return deduplicated;
}
/**
 * Tells whether a string is {@code null} or contains only whitespace.
 *
 * @param s value to check
 * @return {@code true} when {@code s} is null or blank after trimming
 */
private static boolean checkNull(String s) {
    if (s == null) {
        return true;
    }
    return s.trim().isEmpty();
}
/**
 * Sets the Spark checkpoint directory to {@code path + "/checkpoint"} and checkpoints the
 * dataset.
 *
 * @param data input dataset
 * @param path base folder under which the {@code checkpoint} directory is placed
 * @return the result of {@code data.checkpoint()}
 */
@InsightComponent(name = "checkpoint", description = "checkpoint")
public static Dataset checkpoint(
        @InsightComponentArg(externalInput = true, name = "数据集", description = "数据集") Dataset data,
        @InsightComponentArg(name = "路径", description = "路径", defaultValue = "${MISC_FOLDER}") String path) {
    SparkSession session = SparkContextBuilder.getSession();
    final String checkpointDir = path + "/checkpoint";
    session.sparkContext().setCheckpointDir(checkpointDir);
    return data.checkpoint();
}
}