All downloads are free. The search and download functionality uses the official Maven repository.

com.datastax.data.prepare.spark.dataset.DataSetTransformation Maven / Gradle / Ivy

The newest version!
package com.datastax.data.prepare.spark.dataset;

import com.datastax.insight.spec.Operator;
import com.datastax.insight.annonation.InsightComponent;
import com.datastax.insight.annonation.InsightComponentArg;
import com.datastax.data.prepare.util.Consts;
import com.datastax.data.prepare.util.SharedMethods;
import org.apache.parquet.Strings;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.function.FlatMapFunction;
import org.apache.spark.api.java.function.MapFunction;
import org.apache.spark.api.java.function.MapPartitionsFunction;
import org.apache.spark.sql.Column;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Encoder;
import org.apache.spark.sql.Row;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import scala.Tuple2;

import java.util.Map;

public class DataSetTransformation implements Operator {
    private static final Logger logger = LoggerFactory.getLogger(DataSetTransformation.class);
    /**
     * Joins {@code left} with {@code right} using an explicit join expression.
     *
     * @param left      dataset the join is invoked on
     * @param right     dataset being joined in
     * @param joinExprs join condition
     * @param joinType  join type name, forwarded unchanged
     * @return the result of {@code left.join(right, joinExprs, joinType)}
     */
    protected static Dataset join(Dataset left, Dataset right, Column joinExprs, String joinType) {
        final Dataset joined = left.join(right, joinExprs, joinType);
        return joined;
    }

//    @Deprecated
    protected static  Dataset> joinWith(Dataset left, Dataset right, Column joinExprs, String joinType){
        return left.joinWith(right,joinExprs,joinType);
    }

    /**
     * Equi-joins two datasets on one column from each side.
     *
     * <p>If either dataset is {@code null} no join is attempted: the left dataset is returned as a
     * DataFrame, or {@code null} when the left dataset itself is missing. A {@code null} or empty
     * {@code joinType} falls back to {@code "inner"}.
     *
     * @param left     left dataset
     * @param right    right dataset
     * @param keyLeft  name of the join column in {@code left}
     * @param keyRight name of the join column in {@code right}
     * @param joinType join type name; {@code "inner"} when null or empty
     * @return the joined dataset, or the fallback described above when an input is missing
     */
    @InsightComponent(name = "Join", description = "Join", type = "com.datastax.insight.dataprprocess.join", icon = "arrows", order = 50010501)
    public static Dataset join(
            @InsightComponentArg(externalInput = true, name = "左数据集", description = "left dataset") Dataset left,
            @InsightComponentArg(externalInput = true, name = "右数据集", description = "right dataset") Dataset right,
            @InsightComponentArg(name = "左列名", description = "keyLeft") String keyLeft,
            @InsightComponentArg(name = "右列名", description = "keyRight") String keyRight,
            // defaultValue must be one of the listed items; it was misspelled as "innner".
            @InsightComponentArg(name = "连接方法", description = "join方法", defaultValue = "inner", items = "inner;outer;left_outer;right_outer;left_semi") String joinType) {
        if (left == null || right == null) {
            logger.info("left或者right数据集为空,返回left数据集");
            // Calling toDF() on a missing left dataset would throw a NullPointerException.
            return left == null ? null : left.toDF();
        }
        if (joinType == null || joinType.isEmpty()) {
            joinType = "inner";
        }
        Column condition = left.col(keyLeft).equalTo(right.col(keyRight));
        return left.join(right, condition, joinType);
    }

    /**
     * Join component that takes several key columns per side, each list passed as one
     * ';'-separated string.
     *
     * NOTE(review): this copy of the file is truncated in two places below (the lines that start
     * a {@code for} loop and then run straight into {@code Dataset[] split(}). The remainder of
     * joinMutil, the annotation and start of the signature of both split overloads, and the loop
     * bodies are missing and must be restored from the original source before this compiles.
     *
     * NOTE(review): the description of the keyRights argument repeats the left-side text
     * ("左各列名") — looks like a copy/paste slip; confirm.
     * NOTE(review): defaultValue "innner" is not one of the listed items — confirm it should be "inner".
     */
    @InsightComponent(name = "JoinMutil", description = "JoinMutil", type = "com.datastax.insight.dataprprocess.join", icon = "arrows")
    public static  Dataset joinMutil(
            @InsightComponentArg(externalInput = true, name = "左数据集", description = "left dataset") Dataset left,
            @InsightComponentArg(externalInput = true, name = "右数据集", description = "right dataset") Dataset right,
            @InsightComponentArg(name = "左各列名", description = "左各列名,以分号隔开") String keyLefts,
            @InsightComponentArg(name = "右各列名", description = "左各列名,以分号隔开") String keyRights,
            @InsightComponentArg(name = "连接方法", description = "join方法", defaultValue = "innner", items = "inner;outer;left_outer;right_outer;left_semi") String joinType) {
        // NOTE(review): when left itself is null, left.toDF() below throws NullPointerException.
        if(left == null || right == null) {
            logger.info("left或者right数据集为空,返回left数据集");
            return left.toDF();
        }
        // Missing or empty join type defaults to an inner join.
        if(joinType == null || joinType.length() == 0) {
            joinType = "inner";
        }
        // Starts out null; presumably accumulated by the (truncated) loop below — confirm.
        Column conditionColumn = null;
        // Key column names for each side, split on ';'.
        String[] keyLeftArray = keyLefts.split(";");
        String[] keyRightArray = keyRights.split(";");
        // NOTE(review): truncated line — end of joinMutil and start of split(Dataset, String) are missing.
        for (int i =0;i Dataset[] split(
            @InsightComponentArg(externalInput = true, name = "数据集", description = "输入的DataSet") Dataset dataSet,
            @InsightComponentArg(name = "数据切分权重", description = "分割比例") String weights){
        // With no dataset or no weights, the input is returned as a single-element array.
        if(dataSet == null || Strings.isNullOrEmpty(weights)) {
            logger.info("数据集为空或者weights为空");
            return new Dataset[]{ dataSet };
        }
        // Weights arrive as one string separated by Consts.DELIMITER; one double slot per token.
        String[] strs = weights.split(Consts.DELIMITER);
        double[] doubles = new double[strs.length];
        // NOTE(review): truncated line — rest of this overload and start of split(Dataset, double[]) are missing.
        for(int i=0; i Dataset[] split(
            @InsightComponentArg(externalInput = true, name = "数据集", description = "输入的DataSet") Dataset dataSet,
            @InsightComponentArg(name = "数据切分权重", description = "分割比例")double[] weights){
        // NOTE(review): a null weights array throws NullPointerException at weights.length.
        if(dataSet == null || weights.length == 0) {
            logger.info("数据集为空或者weights为空");
            return new Dataset[]{ dataSet };
        }
        // Delegates the actual split to Dataset.randomSplit with the given weights.
        return dataSet.randomSplit(weights);
    }

    /**
     * Unions {@code left} with {@code right}.
     *
     * <p>{@code left} is returned untouched (and a message is logged) when either input is
     * {@code null} or when the two schemas have a different number of fields.
     *
     * @param left  left dataset
     * @param right right dataset
     * @return {@code left.union(right)}, or {@code left} in the cases above
     */
    @InsightComponent(name = "Union", description = "Union", type = "com.datastax.insight.dataprprocess.union", icon = "arrows", order = 50010502)
    public static Dataset union(
            @InsightComponentArg(externalInput = true, name = "左数据集", description = "left dataset") Dataset left,
            @InsightComponentArg(externalInput = true, name = "右数据集", description = "right dataset") Dataset right) {
        final boolean missingInput = left == null || right == null;
        if (missingInput) {
            logger.info("数据集为空,返回left数据集");
            return left;
        }

        final int leftWidth = left.schema().fieldNames().length;
        final int rightWidth = right.schema().fieldNames().length;
        if (leftWidth == rightWidth) {
            return left.union(right);
        }

        logger.info("left和right数据集的列数不等,不能进行union操作,返回left数据集");
        return left;
    }

    /**
     * Applies the given encoder to the dataset via {@code Dataset.as}.
     *
     * @param dataset dataset to convert
     * @param u       encoder passed to {@code as}
     * @return the result of {@code dataset.as(u)}
     */
    protected static Dataset as(Dataset dataset, Encoder u) {
        final Dataset converted = dataset.as(u);
        return converted;
    }

    /**
     * Gives the dataset an alias by calling {@code Dataset.as(String)}.
     *
     * @param dataset dataset to alias
     * @param alias   alias name
     * @return the result of {@code dataset.as(alias)}
     */
    @InsightComponent(name = "Alias", description = "Alias", type = "com.datastax.insight.dataprprocess.alias", icon = "arrows", order = 50010802)
    public static Dataset alias(
            @InsightComponentArg(externalInput = true, name = "数据集", description = "操作数据") Dataset dataset,
            @InsightComponentArg(name = "别名", description = "alias") String alias) {
        final Dataset aliased = dataset.as(alias);
        return aliased;
    }

    /**
     * Sorts a dataset by one or more columns, nulls last.
     *
     * <p>The input dataset is returned unchanged (after logging) when the dataset is {@code null},
     * when no column names are supplied, or when {@code type} is not one of the supported values.
     * A {@code null} or empty {@code type} defaults to {@code Consts.SORT}; any {@code orderType}
     * other than {@code Consts.DESC} sorts ascending.
     *
     * @param dataset   dataset to sort
     * @param type      which operation to run: {@code Consts.SORT}, {@code Consts.SORT_WIRHINPARTITIONS}
     *                  or {@code Consts.ORDERBY}
     * @param cols      column names separated by {@code Consts.DELIMITER}
     * @param orderType {@code Consts.DESC} for descending order, anything else for ascending
     * @return the sorted dataset, or {@code dataset} itself in the fallback cases above
     */
    @InsightComponent(name = "排序", description = "Sort", type = "com.datastax.insight.dataprprocess.sort", icon = "arrows", order = 50010601)
    public static Dataset sort(
            @InsightComponentArg(externalInput = true, name = "数据集", description = "操作数据") Dataset dataset,
            @InsightComponentArg(name = "类型", description = "排序类型", defaultValue = "sort", items = "sort;sortWithinPartitions;orderBy") String type,
            @InsightComponentArg(name = "列名", description = "排序列,用分号隔开") String cols,
            @InsightComponentArg(name = "排列类型", description = "设置列从大到小或者从小到大排列", defaultValue = "ASC", items = "DESC;ASC") String orderType) {
        if (dataset == null) {
            logger.info("数据集为空");
            return dataset;
        }
        // "".split(..) yields a single empty element, so an empty/absent column list has to be
        // caught before splitting; otherwise the length check below never fires for it.
        if (cols == null || cols.isEmpty()) {
            logger.info("选择的列名为空,返回原数据集");
            return dataset;
        }
        String[] columns = cols.split(Consts.DELIMITER);
        if (columns.length == 0) {
            logger.info("选择的列名为空,返回原数据集");
            return dataset;
        }
        if (type == null || type.isEmpty()) {
            type = Consts.SORT;
        }

        boolean descending = Consts.DESC.equals(orderType);
        Column[] ordering = new Column[columns.length];
        for (int i = 0; i < columns.length; i++) {
            Column column = dataset.col(columns[i]);
            ordering[i] = descending ? column.desc_nulls_last() : column.asc_nulls_last();
        }

        if (Consts.SORT.equals(type)) {
            return dataset.sort(ordering);
        }
        if (Consts.SORT_WIRHINPARTITIONS.equals(type)) {
            return dataset.sortWithinPartitions(ordering);
        }
        if (Consts.ORDERBY.equals(type)) {
            return dataset.orderBy(ordering);
        }
        logger.info("type不在可选值内,返回原数据集");
        return dataset;
    }

    /**
     * Projects a dataset onto the given columns or SQL expressions.
     *
     * <p>Returns {@code null} when the dataset is {@code null}. Returns the dataset as a DataFrame
     * (after logging) when no columns are supplied or {@code type} is not a supported value.
     * A {@code null} or empty {@code type} defaults to {@code Consts.COLUMN}.
     *
     * @param dataset dataset to select from
     * @param type    {@code Consts.COLUMN} to select by column name, {@code Consts.EXPRESSION} to
     *                pass the entries to {@code selectExpr}
     * @param cols    entries separated by {@code Consts.DELIMITER}; the split result is passed
     *                through {@code SharedMethods.handleColsWithEmpty}
     * @return the projected dataset, or the fallback described above
     */
    @InsightComponent(name = "Select", description = "Select", type = "com.datastax.insight.dataprprocess.select", icon = "arrows", order = 50010102)
    public static Dataset select(
            @InsightComponentArg(externalInput = true,name = "数据集", description = "操作数据") Dataset dataset,
            @InsightComponentArg(name = "类型", description = "选择的类型", defaultValue = "column", items = "column;expression") String type,
            @InsightComponentArg(name = "列名", description = "选择列,以分号隔开") String cols) {
        // Check the dataset first: calling toDF() on a null dataset would throw.
        if (dataset == null) {
            logger.info("选择的列名为空或者数据集为空,返回原数据集");
            return null;
        }
        // A null column string is treated the same as an empty selection.
        String[] columns = cols == null
                ? new String[0]
                : SharedMethods.handleColsWithEmpty(cols.split(Consts.DELIMITER));
        if (columns.length == 0) {
            logger.info("选择的列名为空或者数据集为空,返回原数据集");
            return dataset.toDF();
        }
        if (type == null || type.isEmpty()) {
            type = Consts.COLUMN;
        }
        if (Consts.COLUMN.equals(type)) {
            // select(String, String...) takes the first name separately from the rest;
            // for a single column the remainder is simply an empty array.
            String[] rest = new String[columns.length - 1];
            System.arraycopy(columns, 1, rest, 0, rest.length);
            return dataset.select(columns[0], rest);
        }
        if (Consts.EXPRESSION.equals(type)) {
            return dataset.selectExpr(columns);
        }
        logger.info("type不在可选值内,返回原数据集");
        return dataset.toDF();
    }

//    @Deprecated
    /**
     * Filters the dataset with a condition expression string.
     *
     * <p>When the dataset is {@code null} or the expression is {@code null}/empty, a message is
     * logged and the input dataset is returned as is.
     *
     * @param dataset dataset to filter
     * @param expr    condition expression passed to {@code Dataset.filter(String)}
     * @return the filtered dataset, or {@code dataset} when there is nothing to apply
     */
    @InsightComponent(name = "Filter", description = "Filter", type = "com.datastax.insight.dataprprocess.filter", icon = "arrows", order = 50010201)
    public static Dataset filter(
            @InsightComponentArg(externalInput = true, name = "数据集", description = "操作数据") Dataset dataset,
            @InsightComponentArg(name = "表达式", description = "表达式") String expr) {
        final boolean canFilter = dataset != null && expr != null && expr.length() != 0;
        if (canFilter) {
            return dataset.filter(expr);
        }
        logger.info("数据集为空或者表达式为空,返回原数据集");
        return dataset;
    }

    /**
     * Applies a condition expression through {@code Dataset.where(String)}.
     *
     * @param dataset dataset to filter
     * @param expr    condition expression
     * @return the result of {@code dataset.where(expr)}
     */
    protected static Dataset where(Dataset dataset, String expr) {
        final Dataset filtered = dataset.where(expr);
        return filtered;
    }

//  agg has already been implemented by songfu; see BasicOperator
//    @Deprecated
    /**
     * Aggregates the dataset by handing the given map to {@code Dataset.agg}.
     *
     * @param dataset dataset to aggregate
     * @param map     aggregation specification forwarded unchanged
     * @return the result of {@code dataset.agg(map)}
     */
    protected static Dataset agg(Dataset dataset, Map map) {
        final Dataset aggregated = dataset.agg(map);
        return aggregated;
    }

    /**
     * Keeps at most {@code n} rows of the dataset.
     *
     * <p>A {@code null} dataset is logged and returned as {@code null}. A negative {@code n} is
     * logged and replaced by 20.
     *
     * @param dataset dataset to truncate
     * @param n       requested row count
     * @return the result of {@code dataset.limit(..)}, or {@code null} for a null dataset
     */
    @InsightComponent(name = "Limit", description = "Limit", type = "com.datastax.insight.dataprprocess.limit", icon = "arrows", order = 50010101)
    public static Dataset limit(
            @InsightComponentArg(externalInput = true, name = "数据集", description = "操作数据") Dataset dataset,
            @InsightComponentArg(name = "行数", description = "limit n") int n) {
        if (dataset == null) {
            logger.info("数据集为空");
            return null;
        }

        int rowCount = n;
        if (rowCount < 0) {
            logger.info("n小于0, 默认为20");
            rowCount = 20;
        }
        return dataset.limit(rowCount);
    }

    /**
     * Intersects two datasets.
     *
     * <p>When either input is {@code null} a message is logged and {@code dataset1} is returned
     * as is (which is {@code null} only if {@code dataset1} itself was null).
     *
     * @param dataset1 first dataset
     * @param dataset2 second dataset
     * @return {@code dataset1.intersect(dataset2)}, or {@code dataset1} when an input is missing
     */
    @InsightComponent(name = "交集", description = "Intersect", type = "com.datastax.insight.dataprprocess.intersect", icon = "arrows", order = 50010505)
    public static Dataset intersect(
            @InsightComponentArg(externalInput = true, name = "数据集1", description = "操作数据1") Dataset dataset1,
            @InsightComponentArg(externalInput = true, name = "数据集2", description = "操作数据2") Dataset dataset2) {
        if (dataset1 != null && dataset2 != null) {
            return dataset1.intersect(dataset2);
        }
        logger.info("数据集为空,返回null");
        return dataset1;
    }

    /**
     * Computes {@code dataset1.except(dataset2)}.
     *
     * <p>When either input is {@code null} a message is logged and {@code dataset1} is returned
     * as is (which is {@code null} only if {@code dataset1} itself was null).
     *
     * @param dataset1 dataset to subtract from
     * @param dataset2 dataset passed to {@code except}
     * @return {@code dataset1.except(dataset2)}, or {@code dataset1} when an input is missing
     */
    @InsightComponent(name = "差集", description = "Except", type = "com.datastax.insight.dataprprocess.expect", icon = "arrows", order = 50010202)
    public static Dataset except(
            @InsightComponentArg(externalInput = true, name = "数据集1", description = "操作数据1") Dataset dataset1,
            @InsightComponentArg(externalInput = true, name = "数据集2", description = "操作数据2") Dataset dataset2) {
        final boolean bothPresent = dataset1 != null && dataset2 != null;
        if (!bothPresent) {
            logger.info("数据集为空,返回null");
            return dataset1;
        }
        final Dataset difference = dataset1.except(dataset2);
        return difference;
    }

//    @Deprecated
    /**
     * Renames a column through {@code Dataset.withColumnRenamed}.
     *
     * @param dataset      dataset to operate on
     * @param existingName current column name
     * @param newName      replacement column name
     * @return the result of {@code dataset.withColumnRenamed(existingName, newName)}
     */
    protected static Dataset withColumnRenamed(Dataset dataset, String existingName, String newName) {
        final Dataset renamed = dataset.withColumnRenamed(existingName, newName);
        return renamed;
    }

    /**
     * Drops columns, or drops duplicate rows, depending on {@code type}.
     *
     * <ul>
     *   <li>{@code Consts.DROP} (also the default for a null/empty {@code type}): removes the listed
     *       columns; with no columns the dataset is returned as a DataFrame.</li>
     *   <li>{@code Consts.DROP_DUPLICATES}: calls {@code dropDuplicates}, restricted to the listed
     *       columns when any are given.</li>
     *   <li>Any other value: logs and returns the dataset as a DataFrame.</li>
     * </ul>
     *
     * <p>A {@code null} dataset is logged and returned as {@code null}; a {@code null} column
     * string is treated as "no columns".
     *
     * @param dataset dataset to operate on
     * @param type    operation selector, see above
     * @param cols    column names separated by {@code Consts.DELIMITER}; the split result is
     *                passed through {@code SharedMethods.handleColsWithEmpty}
     * @return the resulting dataset, or {@code null} for a null dataset
     */
    @InsightComponent(name = "Drop", description = "Drop", type = "com.datastax.insight.dataprprocess.drop", icon = "arrows", order = 50010104)
    public static Dataset drop(
            @InsightComponentArg(externalInput = true, name = "数据集", description = "dataset") Dataset dataset,
            @InsightComponentArg(name = "类型", description = "drop 的类型", defaultValue = "drop", items = "drop;drop duplicates") String type,
            @InsightComponentArg(name = "列名", description = "丢弃的列名,用分号隔开") String cols) {
        if (dataset == null) {
            logger.info("数据集为空");
            return null;
        }
        // "drop duplicates" is valid without any column, so a missing column string must not
        // blow up on cols.split(..).
        String[] columns = cols == null
                ? new String[0]
                : SharedMethods.handleColsWithEmpty(cols.split(Consts.DELIMITER));
        if (type == null || type.isEmpty()) {
            type = Consts.DROP;
        }
        if (Consts.DROP.equals(type)) {
            if (columns.length == 0) {
                logger.info("列名为空, 返回原数据集");
                return dataset.toDF();
            }
            return dataset.drop(columns);
        }
        if (Consts.DROP_DUPLICATES.equals(type)) {
            if (columns.length == 0) {
                return dataset.dropDuplicates().toDF();
            }
            return dataset.dropDuplicates(columns).toDF();
        }
        logger.info("type不再可选项内, 返回原数据集");
        return dataset.toDF();
    }

    /**
     * Maps every element of the dataset with {@code func}, using {@code u} as the result encoder.
     *
     * @param dataset dataset to transform
     * @param func    mapping function
     * @param u       encoder passed along with the function
     * @return the result of {@code dataset.map(func, u)}
     */
    @InsightComponent(name = "Map", description = "Map", type = "com.datastax.insight.dataprprocess.map", icon = "arrows", order = 50010803)
    public static Dataset map(
            @InsightComponentArg(name = "数据集", description = "操作数据") Dataset dataset,
            @InsightComponentArg(name = "函数") MapFunction func,
            @InsightComponentArg(name = "编码器") Encoder u) {
        final Dataset mapped = dataset.map(func, u);
        return mapped;
    }

    /**
     * Forwards to {@code Dataset.mapPartitions} with the given function and encoder.
     *
     * @param dataset dataset to transform
     * @param func    per-partition function
     * @param u       encoder passed along with the function
     * @return the result of {@code dataset.mapPartitions(func, u)}
     */
    protected static Dataset mapPartitions(Dataset dataset, MapPartitionsFunction func, Encoder u) {
        final Dataset mapped = dataset.mapPartitions(func, u);
        return mapped;
    }

    /**
     * Forwards to {@code Dataset.flatMap} with the given function and encoder.
     *
     * @param dataset dataset to transform
     * @param func    flat-map function
     * @param u       encoder passed along with the function
     * @return the result of {@code dataset.flatMap(func, u)}
     */
    protected static Dataset flatMap(Dataset dataset, FlatMapFunction func, Encoder u) {
        final Dataset flattened = dataset.flatMap(func, u);
        return flattened;
    }

    /**
     * Calls {@code Dataset.repartition(int)} on the dataset.
     *
     * @param dataset dataset to repartition
     * @param n       partition count handed to {@code repartition}
     * @return the result of {@code dataset.repartition(n)}
     */
    @InsightComponent(name = "Repartition", description = "Repartition", type = "com.datastax.insight.dataprprocess.repartition", icon = "arrows", order = 50010504)
    public static Dataset repartition(
            @InsightComponentArg(externalInput = true, name = "data", description = "操作数据") Dataset dataset,
            @InsightComponentArg(name = "n", description = "n") int n) {
        final Dataset repartitioned = dataset.repartition(n);
        return repartitioned;
    }

    /**
     * Calls {@code Dataset.coalesce(int)} on the dataset.
     *
     * @param dataset dataset to coalesce
     * @param n       partition count handed to {@code coalesce}
     * @return the result of {@code dataset.coalesce(n)}
     */
    @InsightComponent(name = "Coalesce", description = "Coalesce", type = "com.datastax.insight.dataprprocess.coalesce", icon = "arrows", order = 50010503)
    public static Dataset coalesce(
            @InsightComponentArg(externalInput = true, name = "data", description = "操作数据") Dataset dataset,
            @InsightComponentArg(name = "n", description = "n") int n) {
        final Dataset coalesced = dataset.coalesce(n);
        return coalesced;
    }

    /**
     * Converts the dataset to a {@code JavaRDD}.
     *
     * @param dataset dataset to convert
     * @return the result of {@code dataset.toJavaRDD()}
     */
    @InsightComponent(name = "toJavaRDD", description = "toJavaRDD", type = "com.datastax.insight.dataprprocess.toJavaRDD", icon = "arrows", order = 500204)
    public static JavaRDD toJavaRDD(
            @InsightComponentArg(externalInput = true, name = "data", description = "操作数据") Dataset dataset) {
        final JavaRDD rdd = dataset.toJavaRDD();
        return rdd;
    }

    /**
     * Converts the dataset with {@code Dataset.toJSON()}.
     *
     * @param dataset dataset to convert
     * @return the result of {@code dataset.toJSON()}
     */
    @InsightComponent(name = "toJson", description = "toJson", type = "com.datastax.insight.dataprprocess.toJson", icon = "arrows", order = 500203)
    public static Dataset toJSON(
            @InsightComponentArg(externalInput = true, name = "data", description = "操作数据") Dataset dataset) {
        final Dataset json = dataset.toJSON();
        return json;
    }

    /**
     * Converts the dataset to a DataFrame, keeping its column names.
     *
     * @param dataset dataset to convert
     * @return the result of {@code dataset.toDF()}
     */
    protected static Dataset toDF(Dataset dataset) {
        final Dataset frame = dataset.toDF();
        return frame;
    }

    /**
     * Converts the dataset to a DataFrame using the supplied column names.
     *
     * @param dataset dataset to convert
     * @param cols    column names handed to {@code Dataset.toDF(String...)}
     * @return the result of {@code dataset.toDF(cols)}
     */
    protected static Dataset toDF(Dataset dataset, String[] cols) {
        final Dataset frame = dataset.toDF(cols);
        return frame;
    }

//    @Deprecated
    /**
     * toDF component that receives the column names as a single string.
     *
     * NOTE(review): this copy of the file is truncated on the line that starts a {@code for}
     * loop and runs straight into {@code Dataset cast(}. The rest of this method, plus any
     * annotation, modifiers and generic parameters of the following cast method, are missing
     * and must be restored from the original source before this compiles.
     */
    @InsightComponent(name = "toDF", description = "toDF", type = "com.datastax.insight.dataprprocess.toDF", icon = "arrows", order = 500201)
    public static  Dataset toDF(
            @InsightComponentArg(externalInput = true, name = "数据集") Dataset dataset,
            @InsightComponentArg(name = "列名") String cols){
        // NOTE(review): dataset is null on this branch, so dataset.toDF() throws NullPointerException.
        if(dataset == null) {
            return dataset.toDF();
        }
        // One slot per existing column of the dataset.
        String[] columns = new String[dataset.columns().length];
        if(cols == null || cols.length() == 0) {
            // Presumably the prefix for generated default names ("_c0", "_c1", ...) — the loop
            // that would use it is truncated, so confirm against the original source.
            String preffix = "_c";
            // NOTE(review): truncated line — end of toDF and start of cast are missing.
            for(int i=0; i Dataset cast(Dataset dataset,String column, String type){
        // Column names to cast, separated by Consts.DELIMITER.
        String[] columns = column.split(Consts.DELIMITER);
        Dataset result = null;
        for (String c : columns) {
            // The first cast is applied to the input dataset; later ones chain onto the running
            // result. The source column is always looked up on the original dataset.
            if(result == null) {
                result = dataset.withColumn(c, dataset.col(c).cast(type));
            } else {
                result = result.withColumn(c, dataset.col(c).cast(type));
            }
        }
        // Falls back to the untouched dataset as a DataFrame if no column was processed.
        return result == null ? dataset.toDF() : result;
    }

    /**
     * Prints the first rows of a dataset to standard output, between start/end marker lines.
     *
     * <p>Nothing is printed for a {@code null} dataset (a message is logged instead). Negative
     * {@code numRows} and {@code truncate} values are replaced by 10 and 20 respectively.
     *
     * @param data     dataset to display
     * @param numRows  number of rows to show
     * @param truncate truncate argument handed to {@code Dataset.show(int, int)}
     */
    @InsightComponent(name = "Show", description = "Show", type = "com.datastax.insight.dataprprocess.show")
    public static void show(
            @InsightComponentArg(name = "数据集", description = "数据集") Dataset data,
            @InsightComponentArg(name = "行数", description = "显示的行数", defaultValue = "10") int numRows,
            @InsightComponentArg(name = "字符数", description = "显示的字符数", defaultValue = "20") int truncate) {
        if (data == null) {
            logger.info("数据集为空");
            return;
        }

        final int rowsToShow = numRows >= 0 ? numRows : 10;
        final int cellWidth = truncate >= 0 ? truncate : 20;

        System.out.println("===DataExa-Insight User Output Started===");
        data.show(rowsToShow, cellWidth);
        System.out.println("===DataExa-Insight User Output Ended===");
    }


}