com.datastax.data.prepare.spark.dataset.DataSetTransformation Maven / Gradle / Ivy
The newest version!
package com.datastax.data.prepare.spark.dataset;
import com.datastax.insight.spec.Operator;
import com.datastax.insight.annonation.InsightComponent;
import com.datastax.insight.annonation.InsightComponentArg;
import com.datastax.data.prepare.util.Consts;
import com.datastax.data.prepare.util.SharedMethods;
import org.apache.parquet.Strings;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.function.FlatMapFunction;
import org.apache.spark.api.java.function.MapFunction;
import org.apache.spark.api.java.function.MapPartitionsFunction;
import org.apache.spark.sql.Column;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Encoder;
import org.apache.spark.sql.Row;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import scala.Tuple2;
import java.util.Map;
public class DataSetTransformation implements Operator {
// Class-wide SLF4J logger; the operators in this class use it to report inputs they skip or default.
private static final Logger logger = LoggerFactory.getLogger(DataSetTransformation.class);
/**
 * Joins two datasets using an explicit join expression.
 *
 * @param left      dataset on the left side of the join
 * @param right     dataset on the right side of the join
 * @param joinExprs column expression used as the join condition
 * @param joinType  join type name, forwarded unchanged
 * @return the result of {@code left.join(right, joinExprs, joinType)}
 */
protected static Dataset join(Dataset left, Dataset right, Column joinExprs, String joinType) {
    Dataset joined = left.join(right, joinExprs, joinType);
    return joined;
}
// @Deprecated
protected static Dataset> joinWith(Dataset left, Dataset right, Column joinExprs, String joinType){
return left.joinWith(right,joinExprs,joinType);
}
/**
 * Joins two datasets on the equality of one column from each side.
 *
 * @param left     left dataset; when {@code null} the method logs and returns {@code null}
 * @param right    right dataset; when {@code null} the method logs and returns {@code left.toDF()}
 * @param keyLeft  name of the join column in {@code left}
 * @param keyRight name of the join column in {@code right}
 * @param joinType join type name; {@code null} or empty falls back to {@code "inner"}
 * @return the joined dataset, or the fallback described above when an input is missing
 */
@InsightComponent(name = "Join", description = "Join", type = "com.datastax.insight.dataprprocess.join", icon = "arrows", order = 50010501)
public static Dataset join(
        @InsightComponentArg(externalInput = true, name = "左数据集", description = "left dataset") Dataset left,
        @InsightComponentArg(externalInput = true, name = "右数据集", description = "right dataset") Dataset right,
        @InsightComponentArg(name = "左列名", description = "keyLeft") String keyLeft,
        @InsightComponentArg(name = "右列名", description = "keyRight") String keyRight,
        @InsightComponentArg(name = "连接方法", description = "join方法", defaultValue = "inner", items = "inner;outer;left_outer;right_outer;left_semi") String joinType) {
    if (left == null || right == null) {
        logger.info("left或者right数据集为空,返回left数据集");
        // Guard the dereference: the original called left.toDF() even when left itself was null.
        return left == null ? null : left.toDF();
    }
    if (joinType == null || joinType.length() == 0) {
        joinType = "inner";
    }
    Column condition = left.col(keyLeft).equalTo(right.col(keyRight));
    return left.join(right, condition, joinType);
}
@InsightComponent(name = "JoinMutil", description = "JoinMutil", type = "com.datastax.insight.dataprprocess.join", icon = "arrows")
public static Dataset joinMutil(
@InsightComponentArg(externalInput = true, name = "左数据集", description = "left dataset") Dataset left,
@InsightComponentArg(externalInput = true, name = "右数据集", description = "right dataset") Dataset right,
@InsightComponentArg(name = "左各列名", description = "左各列名,以分号隔开") String keyLefts,
@InsightComponentArg(name = "右各列名", description = "左各列名,以分号隔开") String keyRights,
@InsightComponentArg(name = "连接方法", description = "join方法", defaultValue = "innner", items = "inner;outer;left_outer;right_outer;left_semi") String joinType) {
if(left == null || right == null) {
logger.info("left或者right数据集为空,返回left数据集");
return left.toDF();
}
if(joinType == null || joinType.length() == 0) {
joinType = "inner";
}
Column conditionColumn = null;
String[] keyLeftArray = keyLefts.split(";");
String[] keyRightArray = keyRights.split(";");
for (int i =0;i Dataset[] split(
@InsightComponentArg(externalInput = true, name = "数据集", description = "输入的DataSet") Dataset dataSet,
@InsightComponentArg(name = "数据切分权重", description = "分割比例") String weights){
if(dataSet == null || Strings.isNullOrEmpty(weights)) {
logger.info("数据集为空或者weights为空");
return new Dataset[]{ dataSet };
}
String[] strs = weights.split(Consts.DELIMITER);
double[] doubles = new double[strs.length];
for(int i=0; i Dataset[] split(
@InsightComponentArg(externalInput = true, name = "数据集", description = "输入的DataSet") Dataset dataSet,
@InsightComponentArg(name = "数据切分权重", description = "分割比例")double[] weights){
if(dataSet == null || weights.length == 0) {
logger.info("数据集为空或者weights为空");
return new Dataset[]{ dataSet };
}
return dataSet.randomSplit(weights);
}
/**
 * Unions two datasets when both are present and have the same number of columns.
 * In every other case the left dataset is returned as-is (which may be {@code null}).
 *
 * @param left  left dataset
 * @param right right dataset
 * @return {@code left.union(right)}, or {@code left} when the union is skipped
 */
@InsightComponent(name = "Union", description = "Union", type = "com.datastax.insight.dataprprocess.union", icon = "arrows", order = 50010502)
public static Dataset union(
        @InsightComponentArg(externalInput = true, name = "左数据集", description = "left dataset") Dataset left,
        @InsightComponentArg(externalInput = true, name = "右数据集", description = "right dataset") Dataset right) {
    if (left != null && right != null) {
        int leftWidth = left.schema().fieldNames().length;
        int rightWidth = right.schema().fieldNames().length;
        if (leftWidth == rightWidth) {
            return left.union(right);
        }
        // Only the column counts are compared here, not the column names or types.
        logger.info("left和right数据集的列数不等,不能进行union操作,返回left数据集");
        return left;
    }
    logger.info("数据集为空,返回left数据集");
    return left;
}
/**
 * Converts a dataset using the given encoder.
 *
 * @param dataset dataset to convert
 * @param u       encoder passed to {@code Dataset.as}
 * @return the result of {@code dataset.as(u)}
 */
protected static Dataset as(Dataset dataset, Encoder u) {
    Dataset converted = dataset.as(u);
    return converted;
}
/**
 * Assigns an alias to a dataset.
 *
 * @param dataset dataset to alias
 * @param alias   alias name passed to {@code Dataset.as}
 * @return the result of {@code dataset.as(alias)}
 */
@InsightComponent(name = "Alias", description = "Alias", type = "com.datastax.insight.dataprprocess.alias", icon = "arrows", order = 50010802)
public static Dataset alias(
        @InsightComponentArg(externalInput = true, name = "数据集", description = "操作数据") Dataset dataset,
        @InsightComponentArg(name = "别名", description = "alias") String alias) {
    Dataset aliased = dataset.as(alias);
    return aliased;
}
/**
 * Sorts a dataset by one or more columns, all in the same direction, with nulls last.
 *
 * @param dataset   dataset to sort; when {@code null} the method logs and returns {@code null}
 * @param type      which Dataset method to use: {@code Consts.SORT}, {@code Consts.SORT_WIRHINPARTITIONS}
 *                  or {@code Consts.ORDERBY}; {@code null} or empty falls back to {@code Consts.SORT}
 * @param cols      column names separated by {@code Consts.DELIMITER}
 * @param orderType {@code Consts.DESC} for descending order; any other value sorts ascending
 * @return the sorted dataset, or the input dataset unchanged when no column is given
 *         or {@code type} is not one of the supported values
 */
@InsightComponent(name = "排序", description = "Sort", type = "com.datastax.insight.dataprprocess.sort", icon = "arrows", order = 50010601)
public static Dataset sort(
        @InsightComponentArg(externalInput = true, name = "数据集", description = "操作数据") Dataset dataset,
        @InsightComponentArg(name = "类型", description = "排序类型", defaultValue = "sort", items = "sort;sortWithinPartitions;orderBy") String type,
        @InsightComponentArg(name = "列名", description = "排序列,用分号隔开") String cols,
        @InsightComponentArg(name = "排列类型", description = "设置列从大到小或者从小到大排列", defaultValue = "ASC", items = "DESC;ASC") String orderType) {
    if (dataset == null) {
        logger.info("数据集为空");
        return dataset;
    }
    // A null column list would throw on split(), and "" splits to [""] which slips past the
    // length check below and fails later in dataset.col(""); treat both as "no columns".
    if (cols == null || cols.trim().isEmpty()) {
        logger.info("选择的列名为空,返回原数据集");
        return dataset;
    }
    String[] columns = cols.split(Consts.DELIMITER);
    if (columns.length == 0) {
        logger.info("选择的列名为空,返回原数据集");
        return dataset;
    }
    if (type == null || type.length() == 0) {
        type = Consts.SORT;
    }
    boolean descending = Consts.DESC.equals(orderType);
    Column[] ordering = new Column[columns.length];
    for (int i = 0; i < columns.length; i++) {
        Column column = dataset.col(columns[i]);
        ordering[i] = descending ? column.desc_nulls_last() : column.asc_nulls_last();
    }
    if (Consts.SORT.equals(type)) {
        return dataset.sort(ordering);
    }
    if (Consts.SORT_WIRHINPARTITIONS.equals(type)) {
        return dataset.sortWithinPartitions(ordering);
    }
    if (Consts.ORDERBY.equals(type)) {
        return dataset.orderBy(ordering);
    }
    logger.info("type不在可选值内,返回原数据集");
    return dataset;
}
/**
 * Selects columns or SQL expressions from a dataset.
 *
 * @param dataset dataset to select from; when {@code null} the method logs and returns {@code null}
 * @param type    {@code Consts.COLUMN} to select by column name or {@code Consts.EXPRESSION} to use
 *                {@code selectExpr}; {@code null} or empty falls back to {@code Consts.COLUMN}
 * @param cols    column names or expressions separated by {@code Consts.DELIMITER}
 * @return the projected dataset, or {@code dataset.toDF()} when no column is given
 *         or {@code type} is not one of the supported values
 */
@InsightComponent(name = "Select", description = "Select", type = "com.datastax.insight.dataprprocess.select", icon = "arrows", order = 50010102)
public static Dataset select(
        @InsightComponentArg(externalInput = true,name = "数据集", description = "操作数据") Dataset dataset,
        @InsightComponentArg(name = "类型", description = "选择的类型", defaultValue = "column", items = "column;expression") String type,
        @InsightComponentArg(name = "列名", description = "选择列,以分号隔开") String cols) {
    // Check the dataset first: the original called dataset.toDF() inside its own null guard.
    if (dataset == null) {
        logger.info("选择的列名为空或者数据集为空,返回原数据集");
        return null;
    }
    // NOTE(review): handleColsWithEmpty presumably drops blank entries - confirm in SharedMethods.
    // A null column list is treated as "no columns" instead of throwing on split().
    String[] columns = cols == null
            ? new String[0]
            : SharedMethods.handleColsWithEmpty(cols.split(Consts.DELIMITER));
    if (columns.length == 0) {
        logger.info("选择的列名为空或者数据集为空,返回原数据集");
        return dataset.toDF();
    }
    if (type == null || type.length() == 0) {
        type = Consts.COLUMN;
    }
    if (Consts.COLUMN.equals(type)) {
        if (columns.length == 1) {
            return (Dataset) dataset.select(columns[0]);
        }
        // select(String, String...) takes the first name separately from the rest.
        String[] rest = new String[columns.length - 1];
        System.arraycopy(columns, 1, rest, 0, columns.length - 1);
        return dataset.select(columns[0], rest);
    }
    if (Consts.EXPRESSION.equals(type)) {
        return dataset.selectExpr(columns);
    }
    logger.info("type不在可选值内,返回原数据集");
    return dataset.toDF();
}
// @Deprecated
/**
 * Filters a dataset with a condition expression given as a string.
 *
 * @param dataset dataset to filter
 * @param expr    condition expression passed to {@code Dataset.filter}
 * @return the filtered dataset, or {@code dataset} unchanged (possibly {@code null})
 *         when the dataset is {@code null} or the expression is {@code null} or empty
 */
@InsightComponent(name = "Filter", description = "Filter", type = "com.datastax.insight.dataprprocess.filter", icon = "arrows", order = 50010201)
public static Dataset filter(
        @InsightComponentArg(externalInput = true, name = "数据集", description = "操作数据") Dataset dataset,
        @InsightComponentArg(name = "表达式", description = "表达式") String expr) {
    boolean usable = dataset != null && expr != null && !expr.isEmpty();
    if (usable) {
        return dataset.filter(expr);
    }
    logger.info("数据集为空或者表达式为空,返回原数据集");
    return dataset;
}
/**
 * Filters a dataset with a condition expression via {@code Dataset.where}.
 *
 * @param dataset dataset to filter
 * @param expr    condition expression
 * @return the result of {@code dataset.where(expr)}
 */
protected static Dataset where(Dataset dataset, String expr) {
    Dataset filtered = dataset.where(expr);
    return filtered;
}
// agg songfu写了,在 BasicOperator
// @Deprecated
/**
 * Aggregates a dataset using the given map, forwarded unchanged to {@code Dataset.agg}.
 *
 * @param dataset dataset to aggregate
 * @param map     aggregation specification passed to {@code Dataset.agg}
 * @return the result of {@code dataset.agg(map)}
 */
protected static Dataset agg(Dataset dataset, Map map) {
    Dataset aggregated = dataset.agg(map);
    return aggregated;
}
/**
 * Limits a dataset to the first {@code n} rows.
 *
 * @param dataset dataset to limit; when {@code null} the method logs and returns {@code null}
 * @param n       number of rows to keep; a negative value is replaced by 20
 * @return the result of {@code dataset.limit(n)}
 */
@InsightComponent(name = "Limit", description = "Limit", type = "com.datastax.insight.dataprprocess.limit", icon = "arrows", order = 50010101)
public static Dataset limit(
        @InsightComponentArg(externalInput = true, name = "数据集", description = "操作数据") Dataset dataset,
        @InsightComponentArg(name = "行数", description = "limit n") int n) {
    if (dataset == null) {
        logger.info("数据集为空");
        return dataset;
    }
    int rowCount = n;
    if (rowCount < 0) {
        logger.info("n小于0, 默认为20");
        rowCount = 20;
    }
    return dataset.limit(rowCount);
}
/**
 * Computes the intersection of two datasets.
 *
 * @param dataset1 first dataset
 * @param dataset2 second dataset
 * @return {@code dataset1.intersect(dataset2)}; when either input is {@code null},
 *         {@code dataset1} is returned unchanged (which may itself be {@code null})
 */
@InsightComponent(name = "交集", description = "Intersect", type = "com.datastax.insight.dataprprocess.intersect", icon = "arrows", order = 50010505)
public static Dataset intersect(
        @InsightComponentArg(externalInput = true, name = "数据集1", description = "操作数据1") Dataset dataset1,
        @InsightComponentArg(externalInput = true, name = "数据集2", description = "操作数据2") Dataset dataset2) {
    if (dataset1 == null || dataset2 == null) {
        // The message now matches the code: dataset1 is returned, not necessarily null.
        logger.info("数据集为空,返回数据集1");
        return dataset1;
    }
    return dataset1.intersect(dataset2);
}
/**
 * Computes the rows of the first dataset that are not in the second one.
 *
 * @param dataset1 first dataset
 * @param dataset2 second dataset
 * @return {@code dataset1.except(dataset2)}; when either input is {@code null},
 *         {@code dataset1} is returned unchanged (which may itself be {@code null})
 */
@InsightComponent(name = "差集", description = "Except", type = "com.datastax.insight.dataprprocess.expect", icon = "arrows", order = 50010202)
public static Dataset except(
        @InsightComponentArg(externalInput = true, name = "数据集1", description = "操作数据1") Dataset dataset1,
        @InsightComponentArg(externalInput = true, name = "数据集2", description = "操作数据2") Dataset dataset2) {
    if (dataset1 == null || dataset2 == null) {
        // The message now matches the code: dataset1 is returned, not necessarily null.
        logger.info("数据集为空,返回数据集1");
        return dataset1;
    }
    return dataset1.except(dataset2);
}
// @Deprecated
/**
 * Renames a column of a dataset.
 *
 * @param dataset      dataset containing the column
 * @param existingName current column name
 * @param newName      new column name
 * @return the result of {@code dataset.withColumnRenamed(existingName, newName)}
 */
protected static Dataset withColumnRenamed(Dataset dataset, String existingName, String newName) {
    Dataset renamed = dataset.withColumnRenamed(existingName, newName);
    return renamed;
}
/**
 * Drops columns from a dataset, or drops duplicate rows.
 *
 * @param dataset dataset to operate on; when {@code null} the method logs and returns {@code null}
 * @param type    {@code Consts.DROP} to remove the listed columns, or {@code Consts.DROP_DUPLICATES}
 *                to remove duplicate rows (considering only the listed columns when any are given);
 *                {@code null} or empty falls back to {@code Consts.DROP}
 * @param cols    column names separated by {@code Consts.DELIMITER}; may be {@code null}
 * @return the resulting dataset, or {@code dataset.toDF()} when there is nothing to drop
 *         or {@code type} is not one of the supported values
 */
@InsightComponent(name = "Drop", description = "Drop", type = "com.datastax.insight.dataprprocess.drop", icon = "arrows", order = 50010104)
public static Dataset drop(
        @InsightComponentArg(externalInput = true, name = "数据集", description = "dataset") Dataset dataset,
        @InsightComponentArg(name = "类型", description = "drop 的类型", defaultValue = "drop", items = "drop;drop duplicates") String type,
        @InsightComponentArg(name = "列名", description = "丢弃的列名,用分号隔开") String cols) {
    if (dataset == null) {
        logger.info("数据集为空");
        return null;
    }
    // "No columns" is a valid input for drop-duplicates, so a null list must not throw on split().
    // NOTE(review): handleColsWithEmpty presumably drops blank entries - confirm in SharedMethods.
    String[] columns = cols == null
            ? new String[0]
            : SharedMethods.handleColsWithEmpty(cols.split(Consts.DELIMITER));
    if (type == null || type.length() == 0) {
        type = Consts.DROP;
    }
    if (Consts.DROP.equals(type)) {
        if (columns.length == 0) {
            logger.info("列名为空, 返回原数据集");
            return dataset.toDF();
        }
        return dataset.drop(columns);
    }
    if (Consts.DROP_DUPLICATES.equals(type)) {
        if (columns.length == 0) {
            return dataset.dropDuplicates().toDF();
        }
        return dataset.dropDuplicates(columns).toDF();
    }
    logger.info("type不再可选项内, 返回原数据集");
    return dataset.toDF();
}
/**
 * Applies a map function to a dataset.
 *
 * @param dataset dataset to transform
 * @param func    function passed to {@code Dataset.map}
 * @param u       encoder for the mapped result
 * @return the result of {@code dataset.map(func, u)}
 */
@InsightComponent(name = "Map", description = "Map", type = "com.datastax.insight.dataprprocess.map", icon = "arrows", order = 50010803)
public static Dataset map(
        @InsightComponentArg(name = "数据集", description = "操作数据") Dataset dataset,
        @InsightComponentArg(name = "函数") MapFunction func,
        @InsightComponentArg(name = "编码器") Encoder u) {
    Dataset mapped = dataset.map(func, u);
    return mapped;
}
/**
 * Applies a partition-level map function to a dataset.
 *
 * @param dataset dataset to transform
 * @param func    function passed to {@code Dataset.mapPartitions}
 * @param u       encoder for the mapped result
 * @return the result of {@code dataset.mapPartitions(func, u)}
 */
protected static Dataset mapPartitions(Dataset dataset, MapPartitionsFunction func, Encoder u) {
    Dataset mapped = dataset.mapPartitions(func, u);
    return mapped;
}
/**
 * Applies a flat-map function to a dataset.
 *
 * @param dataset dataset to transform
 * @param func    function passed to {@code Dataset.flatMap}
 * @param u       encoder for the mapped result
 * @return the result of {@code dataset.flatMap(func, u)}
 */
protected static Dataset flatMap(Dataset dataset, FlatMapFunction func, Encoder u) {
    Dataset flattened = dataset.flatMap(func, u);
    return flattened;
}
/**
 * Repartitions a dataset.
 *
 * @param dataset dataset to repartition
 * @param n       partition count passed to {@code Dataset.repartition}
 * @return the result of {@code dataset.repartition(n)}
 */
@InsightComponent(name = "Repartition", description = "Repartition", type = "com.datastax.insight.dataprprocess.repartition", icon = "arrows", order = 50010504)
public static Dataset repartition(
        @InsightComponentArg(externalInput = true, name = "data", description = "操作数据") Dataset dataset,
        @InsightComponentArg(name = "n", description = "n") int n) {
    Dataset repartitioned = dataset.repartition(n);
    return repartitioned;
}
/**
 * Coalesces a dataset.
 *
 * @param dataset dataset to coalesce
 * @param n       partition count passed to {@code Dataset.coalesce}
 * @return the result of {@code dataset.coalesce(n)}
 */
@InsightComponent(name = "Coalesce", description = "Coalesce", type = "com.datastax.insight.dataprprocess.coalesce", icon = "arrows", order = 50010503)
public static Dataset coalesce(
        @InsightComponentArg(externalInput = true, name = "data", description = "操作数据") Dataset dataset,
        @InsightComponentArg(name = "n", description = "n") int n) {
    Dataset coalesced = dataset.coalesce(n);
    return coalesced;
}
/**
 * Converts a dataset to a {@code JavaRDD}.
 *
 * @param dataset dataset to convert
 * @return the result of {@code dataset.toJavaRDD()}
 */
@InsightComponent(name = "toJavaRDD", description = "toJavaRDD", type = "com.datastax.insight.dataprprocess.toJavaRDD", icon = "arrows", order = 500204)
public static JavaRDD toJavaRDD(
        @InsightComponentArg(externalInput = true, name = "data", description = "操作数据") Dataset dataset) {
    JavaRDD rdd = dataset.toJavaRDD();
    return rdd;
}
/**
 * Converts a dataset via {@code Dataset.toJSON}.
 *
 * @param dataset dataset to convert
 * @return the result of {@code dataset.toJSON()}
 */
@InsightComponent(name = "toJson", description = "toJson", type = "com.datastax.insight.dataprprocess.toJson", icon = "arrows", order = 500203)
public static Dataset toJSON(
        @InsightComponentArg(externalInput = true, name = "data", description = "操作数据") Dataset dataset) {
    Dataset json = dataset.toJSON();
    return json;
}
/**
 * Converts a dataset via {@code Dataset.toDF()}.
 *
 * @param dataset dataset to convert
 * @return the result of {@code dataset.toDF()}
 */
protected static Dataset toDF(Dataset dataset) {
    Dataset frame = dataset.toDF();
    return frame;
}
/**
 * Converts a dataset via {@code Dataset.toDF(String...)} using the given column names.
 *
 * @param dataset dataset to convert
 * @param cols    column names passed to {@code Dataset.toDF}
 * @return the result of {@code dataset.toDF(cols)}
 */
protected static Dataset toDF(Dataset dataset, String[] cols) {
    Dataset frame = dataset.toDF(cols);
    return frame;
}
// @Deprecated
@InsightComponent(name = "toDF", description = "toDF", type = "com.datastax.insight.dataprprocess.toDF", icon = "arrows", order = 500201)
public static Dataset toDF(
@InsightComponentArg(externalInput = true, name = "数据集") Dataset dataset,
@InsightComponentArg(name = "列名") String cols){
if(dataset == null) {
return dataset.toDF();
}
String[] columns = new String[dataset.columns().length];
if(cols == null || cols.length() == 0) {
String preffix = "_c";
for(int i=0; i Dataset cast(Dataset dataset,String column, String type){
String[] columns = column.split(Consts.DELIMITER);
Dataset result = null;
for (String c : columns) {
if(result == null) {
result = dataset.withColumn(c, dataset.col(c).cast(type));
} else {
result = result.withColumn(c, dataset.col(c).cast(type));
}
}
return result == null ? dataset.toDF() : result;
}
/**
 * Prints rows of a dataset to standard output, wrapped between two marker lines.
 *
 * @param data     dataset to display; when {@code null} the method only logs and returns
 * @param numRows  number of rows to show; a negative value is replaced by 10
 * @param truncate value passed as the truncate argument of {@code Dataset.show};
 *                 a negative value is replaced by 20
 */
@InsightComponent(name = "Show", description = "Show", type = "com.datastax.insight.dataprprocess.show")
public static void show(
        @InsightComponentArg(name = "数据集", description = "数据集") Dataset data,
        @InsightComponentArg(name = "行数", description = "显示的行数", defaultValue = "10") int numRows,
        @InsightComponentArg(name = "字符数", description = "显示的字符数", defaultValue = "20") int truncate) {
    if (data != null) {
        int rowsToShow = numRows;
        if (rowsToShow < 0) {
            rowsToShow = 10;
        }
        int width = truncate;
        if (width < 0) {
            width = 20;
        }
        // The marker lines bracket the table in the process output.
        System.out.println("===DataExa-Insight User Output Started===");
        data.show(rowsToShow, width);
        System.out.println("===DataExa-Insight User Output Ended===");
        return;
    }
    logger.info("数据集为空");
}
}
© 2015 - 2025 Weber Informatics LLC | Privacy Policy