// com.datastax.data.prepare.spark.dataset.BasicOperator (Maven / Gradle / Ivy)
//
// Go to download
package com.datastax.data.prepare.spark.dataset;

import com.alibaba.fastjson.JSONArray;
import com.alibaba.fastjson.JSONObject;
import com.datastax.data.prepare.util.*;
import com.datastax.insight.core.driver.SparkContextBuilder;
import com.datastax.insight.spec.Operator;
import com.datastax.data.prepare.spark.dataset.params.Aggregate;
import com.google.common.base.Strings;
import org.apache.spark.rdd.RDD;
import org.apache.spark.sql.*;
import org.apache.spark.sql.expressions.Window;
import org.apache.spark.sql.expressions.WindowSpec;
import org.apache.spark.sql.types.StructField;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import com.datastax.insight.annonation.InsightComponent;
import com.datastax.insight.annonation.InsightComponentArg;

import static org.apache.spark.sql.functions.*;
import org.apache.spark.sql.functions.*;

public class BasicOperator implements Operator {
    private static final Logger logger = LoggerFactory.getLogger(BasicOperator.class);

    /**
     * Appends an identifier column to the dataset.
     *
     * <p>When {@code IDType} equals {@code Consts.INCREMENT} the column is filled from
     * {@code monotonically_increasing_id()}. Any other value (including {@code null}) passes the
     * session's UDF registry to {@code SharedUDFs.uuid} (presumably registering the "uuid" UDF)
     * and then calls the "uuid" UDF with the dataset's first column as its argument.
     *
     * @param data   input dataset; returned as-is when {@code null}
     * @param IDName name of the column to add; the input is returned unchanged when null or empty
     * @param IDType ID strategy selector
     * @return the dataset with the additional column, or {@code data} itself when an input is missing
     */
    @InsightComponent( name = "generateID", type = "com.datastax.insight.dataprprocess.generateID", description = "ID生成", order = 500801 )
    public static  Dataset generateID(
            @InsightComponentArg(externalInput = true, name = "data", description = "数据集") Dataset data,
            @InsightComponentArg(name = "IDName", description = "ID属性名称", request = true) String IDName,
            @InsightComponentArg(name = "IDType", description = "ID类型", request = true, items = "UUID32;Increment", defaultValue = "UUID32") String IDType) {
        final boolean missingInput = data == null || Strings.isNullOrEmpty(IDName);
        if (missingInput) {
            logger.info("IDName is empty");
            return data;
        }

        final Column idColumn;
        if (Consts.INCREMENT.equals(IDType)) {
            idColumn = functions.lit(functions.monotonically_increasing_id());
        } else {
            // The UDF has to be handed the session's registry before it can be called by name.
            SharedUDFs.uuid(SparkContextBuilder.getSession().udf());
            idColumn = functions.callUDF("uuid", data.col(data.columns()[0]));
        }
        return (Dataset) data.withColumn(IDName, idColumn);
    }

//    @Deprecated
    /**
     * Renames one or more columns of the dataset.
     *
     * <p>{@code oldName} and {@code newName} are split on {@code Consts.DELIMITER} and paired by
     * position. The dataset is returned untouched (with an info log) when an argument is missing,
     * when the two lists differ in length, or when any new name is empty.
     *
     * @param dataset input dataset
     * @param oldName delimiter-separated current column names
     * @param newName delimiter-separated replacement column names
     * @return the dataset with the columns renamed, or the original dataset when validation fails
     */
    @InsightComponent( name = "重命名", type = "com.datastax.insight.dataprprocess.rename", description = "重命名", order = 500107 )
    public static  Dataset rename(
            @InsightComponentArg(externalInput = true,name = "dataset",description = "数据集") Dataset dataset,
            @InsightComponentArg(name = "oldName",description = "属性原来的名字",request = true) String oldName,
            @InsightComponentArg(name = "newName",description = "属性的新名字",request = true) String newName) {
        final boolean incomplete = dataset == null
                || Strings.isNullOrEmpty(oldName)
                || Strings.isNullOrEmpty(newName);
        if (incomplete) {
            logger.info("oldName or newname is empty!");
            return dataset;
        }

        final String[] sources = SharedMethods.handleColsWithEmpty(oldName.split(Consts.DELIMITER));
        final String[] targets = SharedMethods.handleColsWithEmpty(newName.split(Consts.DELIMITER));

        // Old and new names must pair up one-to-one.
        if (targets.length != sources.length) {
            logger.info("The number of newname is not the same as the number of oldname");
            return dataset;
        }

        // Validate every target name up front so a bad entry never leaves a half-renamed dataset.
        int idx = 0;
        while (idx < targets.length) {
            if (Strings.isNullOrEmpty(targets[idx])) {
                logger.info("newName is empty!");
                return dataset;
            }
            idx++;
        }

        // Apply the renames in order.
        Dataset renamed = dataset;
        for (int k = 0; k < targets.length; k++) {
            renamed = (Dataset) renamed.withColumnRenamed(sources[k], targets[k]);
        }
        return renamed;
    }

    /**
     * Naming component: either assigns initial column names to the whole dataset or renames
     * selected columns, depending on the "selector" entry of {@code object}.
     *
     * <p>NOTE(review): two {@code for} loops in this method are truncated in this listing (see the
     * inline notes); their bodies must be restored from the original source before this compiles.
     *
     * @param data   input dataset; when {@code null} (and {@code object} is non-empty) the method
     *               logs and returns {@code null}
     * @param object naming parameters; read keys are "selector", "selectorValue" and "method".
     *               It is dereferenced before any null check, so a {@code null} value throws a
     *               {@code NullPointerException}
     * @return the renamed dataset; {@code data} when {@code object} is empty; {@code null} when
     *         {@code data} is null or no branch produced a result
     */
    @InsightComponent(name = "命名", description = "命名组件, 包含重命名和初始化命名")
    public static  Dataset name(
            @InsightComponentArg(externalInput = true, name = "数据集", description = "数据集") Dataset data,
            @InsightComponentArg(name = "参数", description = "命名参数") JSONObject object) {
        // Empty parameters: nothing to do, hand the dataset back unchanged.
        if(object.isEmpty()) {
            logger.info("命名组件的参数为空,返回原数据集");
            return data;
        }
        if(data == null) {
            logger.info("命名组件参数中的数据集为空，返回null");
            return null;
        }
        String type = object.getString("selector");
        StructField[] fields = data.schema().fields();
        // Mode 1: initial naming of every column.
        if(Consts.INITIALIZA_NAME.equals(type)) {
            String initialType = object.getString("selectorValue");
            // A missing "method" entry would make getString return null and this trim() throw.
            String value = object.getString("method").trim();
            // Blank value: fall back to automatic naming with the "_c" prefix.
            if(value.length() == 0) {
                logger.info("命名组件的初始化命名的value为空, 默认设为列名自动生成, 前缀为_c");
                value = "_c";
                initialType = Consts.AUTO;
            }
            if(Consts.AUTO.equals(initialType)) {
                String[] columns = new String[fields.length];
                // NOTE(review): the next line is truncated in this listing. It presumably filled
                // columns[i] from the prefix and the index and then returned data.toDF(columns) -
                // confirm against the original source.
                for(int i=0; i) data.toDF(columns);
            }
            // Manual naming: the supplied list must provide exactly one name per existing column.
            if(Consts.MANUAL.equals(initialType)) {
                String[] columns =  SharedMethods.handleColsWithEmpty(value.split(Consts.DELIMITER));
                if(fields.length != columns.length) {
                    throw new CustomException("初始化命名的列名个数和数据集的列名个数不相等");
                }
                return (Dataset) data.toDF(columns);
            }
        }
        // Mode 2: rename individual columns listed in the "selectorValue" array.
        if(Consts.RENAME.equals(type)) {
            JSONArray array = object.getJSONArray("selectorValue");
            // NOTE(review): the next line is truncated in this listing. It presumably iterated
            // over the array, read an old/new name pair from each entry and reassigned data -
            // the JSON keys are not visible here, confirm against the original source.
            for(int i=0; i) data.withColumnRenamed(oldName, newName);
            }
            return data;
        }

        // No recognised selector (or initial type): no dataset is produced.
        return null;
    }


    /**
     * Component entry point for aggregation: bundles the four string arguments into an
     * {@link Aggregate} parameter object and delegates to {@code aggregate(Dataset, Aggregate)}.
     *
     * @param data        input dataset
     * @param type        grouping kind (groupBy, rollup or cube)
     * @param columns     grouping column names, separated by ';'
     * @param funcType    aggregate function name
     * @param funcColumns column names to aggregate, separated by ';'
     * @return whatever the delegate returns for these parameters
     */
    @InsightComponent( name = "聚合", description = "聚合" )
    public static  Dataset aggregate(
            @InsightComponentArg(externalInput = true, name = "数据集", description = "数据集") Dataset data,
            @InsightComponentArg(name = "分组类型", description = "分组类型", defaultValue = "groupBy", items = "groupBy;rollup;cube") String type,
            @InsightComponentArg(name = "分组列名", description = "分组的列名,用;隔开") String columns,
            @InsightComponentArg(name = "聚合函数类型", description = "聚合函数类型", defaultValue = "sum", items = "min;max;avg;sum;count;collect_list;collect_set;distinct_count;distinct_sum;approx_count_distinct") String funcType,
            @InsightComponentArg(name = "聚合列名", description = "聚合的列名,用;隔开") String funcColumns) {
        return aggregate(data, new Aggregate(type, columns, funcType, funcColumns));
    }

    /**
     * Groups {@code data} (groupBy / rollup / cube) and applies the configured aggregate function
     * to each aggregate column.
     *
     * <p>Defaults: a missing grouping type becomes {@code Consts.GROUPBY}, a missing function type
     * becomes {@code Consts.SUM}. When no grouping columns are configured the aggregation runs over
     * the whole dataset. An unrecognised grouping type leaves the dataset unchanged.
     *
     * @param data      input dataset
     * @param aggregate aggregation parameters; its type and function type may be filled in with
     *                  the defaults above as a side effect
     * @return the aggregated dataset, or {@code data} itself when no aggregate columns are
     *         configured or the grouping type is unknown
     * @throws NullPointerException if the aggregate column list resolves to no columns
     */
    protected static  Dataset aggregate(Dataset data, Aggregate aggregate) {
        if(Strings.isNullOrEmpty(aggregate.getFuncColumns())) {
            logger.info("聚合选中的列名数组为空，返回原数据集");
            return data;
        }
        if(Strings.isNullOrEmpty(aggregate.getType())) {
            aggregate.setType(Consts.GROUPBY);
        }
        if(Strings.isNullOrEmpty(aggregate.getFuncType())) {
            aggregate.setFuncType(Consts.SUM);
        }

        // Only split the grouping list when there is one: the previous code dereferenced
        // getColumns() unconditionally and threw a NullPointerException for a null list.
        // An empty array makes groupBy/rollup/cube behave like their no-argument forms.
        boolean noGrouping = Strings.isNullOrEmpty(aggregate.getColumns());
        Column[] groupColumns = noGrouping
                ? new Column[0]
                : aggFunc(null, SharedMethods.handleColsWithEmpty(aggregate.getColumns().split(Consts.DELIMITER)));

        Column[] aggColumns = aggFunc(aggregate.getFuncType(), SharedMethods.handleColsWithEmpty(aggregate.getFuncColumns().split(Consts.DELIMITER)));
        // Check for "no columns" before sizing the tail array; the previous order allocated
        // new Column[-1] first and failed with NegativeArraySizeException instead of this error.
        if(aggColumns.length == 0) {
            throw new NullPointerException("addColumns为空");
        }
        // agg(first, rest...) needs the head separated from the (possibly empty) tail.
        Column first = aggColumns[0];
        Column[] rest = new Column[aggColumns.length - 1];
        System.arraycopy(aggColumns, 1, rest, 0, rest.length);

        RelationalGroupedDataset grouped;
        if(Consts.GROUPBY.equals(aggregate.getType())) {
            grouped = data.groupBy(groupColumns);
        } else if(Consts.ROLLUP.equals(aggregate.getType())) {
            grouped = data.rollup(groupColumns);
        } else if(Consts.CUBE.equals(aggregate.getType())) {
            grouped = data.cube(groupColumns);
        } else {
            // Unknown grouping type: keep the historical behaviour of returning the input as-is.
            return data;
        }
        return (Dataset) grouped.agg(first, rest);
    }

    // NOTE(review): this listing is truncated on the `for` line below. Everything between the
    // loop header of aggFunc and the parameter list of explode is missing: the rest of aggFunc's
    // body, its closing brace, and explode's annotation and method header. Restore it from the
    // original source; nothing about aggFunc beyond its signature and result-array allocation
    // can be confirmed from here.
    //
    // What remains of explode: it turns one row into several by exploding a column.
    //   - data == null: logs and returns null (the input itself).
    //   - blank explodeCol / resultCol / explodeColType: NullPointerException.
    //   - explodeColType "数组": explodeCol must pass checkColAndType, otherwise
    //     IllegalArgumentException; the column is exploded directly.
    //   - explodeColType "有分隔符的字符串": separator must be non-blank (NullPointerException
    //     otherwise); the column is split on the processed separator and then exploded.
    //   - any other explodeColType: the method returns null.
    // Although the argument descriptions mention several delimiter-separated columns, the visible
    // code passes explodeCol and resultCol to Spark as single column names.
    private static Column[] aggFunc(String type, String[] cols) {
        Column[] columns = new Column[cols.length];
        for(int i=0,j=0; i Dataset explode(
            @InsightComponentArg(externalInput = true, name = "数据集", description = "数据集",defaultValue = "${output}") Dataset data,
            @InsightComponentArg(name = "扩展行数据格式", description = "数据格式是数组或者以分隔符隔开的字符串",items = "数组;有分隔符的字符串") String explodeColType,
            @InsightComponentArg(name = "扩展行的列名", description = "用于扩展行的列名,多列以分隔符隔开") String explodeCol,
            @InsightComponentArg(name = "扩展行后的新列名",description = "扩展行后的新列名,多列以分隔符隔开") String resultCol,
            @InsightComponentArg(name = "分隔符",description = "分隔符") String  separator) {
        if(data == null) {
            logger.info("数据集为空");
            return data;
        }
        // Required arguments: all three must be non-blank.
        if(explodeCol == null || explodeCol.trim().length() == 0) {
            throw new NullPointerException("扩展行的列名参数为空");
        }
        if(resultCol == null || resultCol.trim().length() == 0) {
            throw new NullPointerException("扩展行后的新列名为空");
        }
        if(explodeColType == null || explodeColType.trim().length() == 0) {
            throw new NullPointerException("扩展行的数据格式为空");
        }
        // Stays null when explodeColType matches neither supported format.
        Dataset result = null;
        if ("数组".equals(explodeColType)) {
            // Array mode: the source column must exist and have the array type.
            StructField[] fields = data.schema().fields();
            if(!checkColAndType(fields, explodeCol)) {
                throw new IllegalArgumentException("扩展行的列名在数据集中不存在或者类型不匹配");
            }
            result =  (Dataset) data.withColumn(resultCol, functions.explode(col(explodeCol)));
        } else if ("有分隔符的字符串".equals(explodeColType)) {
            if(separator == null || separator.trim().length() == 0) {
                throw new NullPointerException("分隔符参数为空");
            }
            // functions.split takes a regex pattern; specialCharactDeal presumably escapes regex
            // metacharacters in the separator - TODO confirm in SeparatorUtil.
            String separatorDeal = SeparatorUtil.specialCharactDeal(separator);
            result =  (Dataset) data.withColumn(resultCol, functions.explode(functions.split(col(explodeCol),separatorDeal)));
        }
        // When the exploded values went into a new column, drop the source column.
        if (result != null && !explodeCol.equals(resultCol)) {
            return (Dataset) result.drop(col(explodeCol));
        }
        return (Dataset) result;
    }

    /**
     * Tells whether the schema contains a field called {@code col} whose data type name equals
     * {@code Consts.ARRAY}. On a match the type name is also printed to standard output.
     *
     * @param fields schema fields to search
     * @param col    column name to look for
     * @return {@code true} if a matching field with the array type exists, {@code false} otherwise
     */
    private static boolean checkColAndType(StructField[] fields, String col) {
        for (int i = 0; i < fields.length; i++) {
            final StructField candidate = fields[i];
            if (!candidate.name().equals(col)) {
                continue;
            }
            final String typeName = candidate.dataType().typeName();
            if (Consts.ARRAY.equals(typeName)) {
                System.out.println("type:" + typeName);
                return true;
            }
        }
        return false;
    }

    /**
     * Exposes the dataset as an RDD.
     *
     * @param data input dataset, must not be {@code null}
     * @return the result of {@code data.rdd()}
     * @throws NullPointerException if {@code data} is {@code null}
     */
    @InsightComponent(name = "DatasetToRDD", description = "数据集转换成RDD")
    public static  RDD ds2Rdd(
            @InsightComponentArg(externalInput = true, name = "数据集", description = "数据集") Dataset data) {
        return java.util.Objects.requireNonNull(data, "数据集为空").rdd();
    }

    /**
     * Replaces {@code targetCol} with the list of its values collected over a row-based window
     * that starts at the current row, within partitions and ordering given by column names.
     *
     * <p>The window ends {@code rows - 1} rows after the current one, but never fewer than one row
     * after it. NOTE(review): this means {@code rows <= 1} yields the same two-row frame as
     * {@code rows == 2}; confirm whether a one-row frame was intended.
     *
     * <p>{@code type} is only checked for being non-blank; the operation applied is always
     * {@code collect_list}.
     *
     * @param data         input dataset
     * @param partitionBys partition column names, delimiter-separated
     * @param orderBys     ordering column names, delimiter-separated
     * @param type         operation type (must be non-blank)
     * @param targetCol    column to collect and overwrite
     * @param rows         requested number of rows per window
     * @return the dataset with {@code targetCol} replaced by the collected list
     * @throws NullPointerException if {@code data} is null or any string argument is blank
     */
    @InsightComponent(name = "window", description = "按照一定参数将多行数据合并为一个窗口，并进行一定处理")
    public static  Dataset window(@InsightComponentArg(externalInput = true, name = "数据集", description = "数据集") Dataset data,
                                        @InsightComponentArg(name = "分片的列名", description = "分片的列名，以分号隔开") String partitionBys,
                                        @InsightComponentArg(name = "排序的列名", description = "排序的列名，以分号隔开") String orderBys,
                                        @InsightComponentArg(name = "操作类型", description = "用于指定对目标列的操作类型", items = "collect_list") String type,
                                        @InsightComponentArg(name = "目标列", description = "要处理的列名") String targetCol,
                                        @InsightComponentArg(name = "行数", description = "window包含的行数") int rows) {
        if (data == null) {
            throw new NullPointerException("数据集为空");
        }
        checkNull(partitionBys, "分片的列名为空");
        checkNull(orderBys, "排序的列名为空");
        checkNull(type, "操作类型为空");
        checkNull(targetCol, "目标列为空");

        // Number of rows after the current one that belong to the frame.
        final int following;
        if (rows > 1) {
            following = rows - 1;
        } else {
            following = 1;
        }

        final String[] partitionNames = partitionBys.split(Consts.DELIMITER);
        final String[] orderNames = orderBys.split(Consts.DELIMITER);
        final Column[] partitionCols = string2Column(data, partitionNames);
        final Column[] orderCols = string2Column(data, orderNames);

        final WindowSpec frame = Window.partitionBy(partitionCols)
                .orderBy(orderCols)
                .rowsBetween(Window.currentRow(), following);
        final Column collected = functions.collect_list(targetCol).over(frame);
        return (Dataset) data.withColumn(targetCol, collected);
    }

    /**
     * Rejects a string that is {@code null} or contains nothing but whitespace.
     *
     * @param s        value to check
     * @param errorMsg message of the exception thrown on failure
     * @throws NullPointerException if {@code s} is null or blank after trimming
     */
    private static void checkNull(String s, String errorMsg) {
        final boolean hasText = s != null && !s.trim().isEmpty();
        if (!hasText) {
            throw new NullPointerException(errorMsg);
        }
    }

    /**
     * Resolves each column name against the dataset.
     *
     * @param dataset dataset the names belong to
     * @param as      column names
     * @return the resolved columns, in the same order as {@code as}
     */
    private static Column[] string2Column(Dataset dataset, String[] as) {
        final Column[] resolved = new Column[as.length];
        int slot = 0;
        for (String columnName : as) {
            resolved[slot++] = dataset.col(columnName);
        }
        return resolved;
    }

    /**
     * Validates the arguments and delegates to {@code StockOperation.multiArray2Array}, which per
     * the component description merges a multi-dimensional array column into a de-duplicated
     * one-dimensional array.
     *
     * @param data         input dataset
     * @param wrapArrayCol name of the column to merge
     * @return the dataset returned by {@code StockOperation.multiArray2Array}
     * @throws NullPointerException if {@code data} is null or {@code wrapArrayCol} is blank
     */
    @InsightComponent(name = "多维数组合并去重", description = "对于某一列进行多维数组合并并去重，生成一维数组")
    public static  Dataset wrapArray2Array(
            @InsightComponentArg(externalInput = true, name = "数据集", description = "数据集") Dataset data,
            @InsightComponentArg(name = "合并列", description = "用于合并的列，列的类型必须为多维数组") String wrapArrayCol) {
        if (data != null) {
            checkNull(wrapArrayCol, "合并列参数为空");
            return (Dataset) StockOperation.multiArray2Array(data.toDF(), wrapArrayCol);
        }
        throw new NullPointerException("数据集为空");
    }

    /**
     * Replaces an array column with a single string built by joining its elements.
     *
     * @param data      input dataset
     * @param arrayCol  name of the array column to convert; it must exist with type name "array"
     * @param separater text placed between elements; ";" is used when {@code null}
     * @return the dataset with {@code arrayCol} overwritten by the joined string
     * @throws NullPointerException     if {@code data} is null or {@code arrayCol} is blank
     * @throws IllegalArgumentException if the column is missing or is not an array
     */
    @InsightComponent(name = "数组转字符串", description = "将数组转换成字符串")
    public static  Dataset array2String(
            @InsightComponentArg(externalInput = true, name = "数据集", description = "数据集") Dataset data,
            @InsightComponentArg(name = "列名", description = "用于转换成String的列名") String arrayCol,
            @InsightComponentArg(name = "连接符", description = "连接符") String separater) {
        if (data == null) {
            throw new NullPointerException("数据集为空");
        }
        if (arrayCol == null || arrayCol.trim().isEmpty()) {
            throw new NullPointerException("arrayCol为空");
        }
        final String joiner = (separater != null) ? separater : ";";

        // Scan the schema until a field with the requested name and the array type turns up.
        final StructField[] schemaFields = data.schema().fields();
        boolean isArrayColumn = false;
        for (int i = 0; i < schemaFields.length && !isArrayColumn; i++) {
            isArrayColumn = schemaFields[i].name().equals(arrayCol)
                    && "array".equals(schemaFields[i].dataType().typeName());
        }
        if (!isArrayColumn) {
            throw new IllegalArgumentException(arrayCol + "列名不存在或者不为Array类型");
        }
        return (Dataset) data.withColumn(arrayCol, functions.concat_ws(joiner, col(arrayCol)));
    }

    /**
     * Validates the arguments and delegates the arithmetic to {@code BasicOperation.mathCompute}.
     *
     * <p>The last argument passed to the delegate is the text {@code method(xCol,yCol)};
     * presumably it names the result column - confirm in {@code BasicOperation}.
     *
     * @param data   input dataset
     * @param xCol   name of the column used as the left operand
     * @param method operation name (plus, minus, multiply or divide per the component metadata)
     * @param type   whether {@code yCol} is a constant or a column name; passed through unchecked
     * @param yCol   right operand: a constant value or a column name, depending on {@code type}
     * @return the dataset returned by {@code BasicOperation.mathCompute}
     * @throws IllegalArgumentException if {@code data} is null or {@code method}, {@code xCol} or
     *                                  {@code yCol} is null or empty
     */
    @InsightComponent(name = "数学运算", description = "对列进行数学运算")
    public static  Dataset mathCompute(
            @InsightComponentArg(externalInput = true, name = "数据集", description = "数据集") Dataset data,
            @InsightComponentArg(name = "列名", description = "用于计算的列名") String xCol,
            @InsightComponentArg(name = "运算方法", description = "数学运算方法", defaultValue = "plus", items = "plus;minus;multiply;divide") String method,
            @InsightComponentArg(name = "类型", description = "值对应的类型，列名或者常量", defaultValue = "constant", items = "constant;column") String type,
            @InsightComponentArg(name = "值", description = "输入值") String yCol) {
        if(data == null) {
            throw new IllegalArgumentException("数据集为空");
        }
        if(method == null || method.length() == 0) {
            throw new IllegalArgumentException("运算方法为空");
        }
        if(xCol == null || xCol.length() == 0) {
            throw new IllegalArgumentException("列名为空");
        }
        if(yCol == null || yCol.length() == 0) {
            // This argument is the "值" (value) input; the previous message repeated the xCol
            // text and pointed the user at the wrong field.
            throw new IllegalArgumentException("值为空");
        }
        return (Dataset) BasicOperation.mathCompute(data.toDF(), xCol, method, yCol, type, method + "(" + xCol + "," + yCol + ")");
    }

    /**
     * Keeps only the rows of {@code data} that belong to groups whose aggregate value satisfies a
     * comparison against {@code threshold}.
     *
     * <p>Steps: aggregate {@code funcColumns} per group, filter the aggregated rows through
     * {@code BasicOperation.groupFilter}, then join the surviving groups back onto {@code data} on
     * the grouping columns and project the original columns.
     *
     * <p>All steps only build a lazy Spark plan, so the logged durations measure plan construction,
     * not job execution.
     *
     * @param data         input dataset
     * @param groupColumns grouping column names separated by ';'
     * @param aggMethod    aggregate function name
     * @param funcColumns  column to aggregate
     * @param compare      comparison operator label, resolved via {@code Consts.MathCompare.getType}
     * @param threshold    value the aggregate is compared against
     * @return the rows of {@code data} in the groups that pass the filter, with the original columns
     * @throws NullPointerException     if {@code data} is {@code null}
     * @throws IllegalArgumentException if {@code aggMethod}, {@code groupColumns} or
     *                                  {@code funcColumns} is blank, or {@code groupColumns}
     *                                  contains no column name
     */
    @InsightComponent(name = "分组过滤", description = "将某些列进行分组后聚合，再加上某些条件对聚合后的结果加以过滤")
    public static  Dataset groupFilter(
            @InsightComponentArg(externalInput = true, name = "数据集", description = "数据集") Dataset data,
            @InsightComponentArg(name = "分组列", description = "用于分组的列，多个列用分号隔开") String groupColumns,
            @InsightComponentArg(name = "聚合函数类型", description = "聚合函数类型", defaultValue = "count", items = "min;max;avg;sum;count;distinct_count;distinct_sum;approx_count_distinct") String aggMethod,
            @InsightComponentArg(name = "聚合列名", description = "聚合的列名") String funcColumns,
            @InsightComponentArg(name = "比较", description = "数学比较", defaultValue = "大于", items = "大于;不小于;小于;不大于;等于") String compare,
            @InsightComponentArg(name = "阈值", description = "用于过滤的阈值") double threshold) {
        // Fail with a clear message instead of an anonymous NPE deep inside the aggregation.
        if(data == null) {
            throw new NullPointerException("数据集为空");
        }
        if(aggMethod == null || aggMethod.trim().length() == 0) {
            throw new IllegalArgumentException("agg method 为空");
        }
        if(groupColumns == null || groupColumns.trim().length() == 0) {
            throw new IllegalArgumentException("group column 为空");
        }
        if(funcColumns == null || funcColumns.trim().length() == 0) {
            throw new IllegalArgumentException("func column 为空");
        }
        String[] colStrings = groupColumns.split(";");
        if(colStrings.length == 0) {
            // Input such as ";" splits into nothing and would leave the join condition null.
            throw new IllegalArgumentException("group column 为空");
        }

        long start = System.currentTimeMillis();
        Dataset groupDs = BasicOperator.aggregate(data, Consts.GROUPBY, groupColumns, aggMethod, funcColumns);
        logger.info("aggregate time: {}", System.currentTimeMillis() - start);

        // Join condition: equality on every grouping column between the data and its groups.
        Column condition = null;
        for(String groupColumn : colStrings) {
            Column sameGroup = data.col(groupColumn).equalTo(groupDs.col(groupColumn));
            condition = (condition == null) ? sameGroup : condition.and(sameGroup);
        }

        // The final projection keeps exactly the columns of the input dataset.
        String[] dataColumns = data.columns();
        Column[] result = new Column[dataColumns.length];
        for(int i = 0; i < dataColumns.length; i++) {
            result[i] = data.col(dataColumns[i]);
        }

        // Name of the aggregate column to filter on, as expected by BasicOperation.groupFilter.
        String agg;
        if("distinct_count".equals(aggMethod)) {
            agg = "count(DISTINCT " + funcColumns + ")";
        } else if ("distinct_sum".equals(aggMethod)) {
            agg = "sum(DISTINCT " + funcColumns + ")";
        } else {
            agg = aggMethod + "(" + funcColumns + ")";
        }

        // The earlier persist()/unpersist() pairs were dropped: each pair enclosed only lazy
        // transformations, so the cache entry was removed before any job could populate it.
        Dataset filterDs = BasicOperation.groupFilter(groupDs.toDF(), agg, Consts.MathCompare.getType(compare), threshold);
        logger.info("filter time: {}", System.currentTimeMillis() - start);

        Dataset resultDs = data.join(filterDs, condition, "right");
        logger.info("join time: {}", System.currentTimeMillis() - start);

        return (Dataset) resultDs.select(result);
    }

    /**
     * Counts the rows of the dataset, writes the count to the Spark session's log and passes the
     * dataset through.
     *
     * @param data input dataset
     * @return {@code data}, unchanged
     */
    @InsightComponent(name = "count", description = "count")
    public static  Dataset countRow(
            @InsightComponentArg(externalInput = true, name = "数据集", description = "数据集") Dataset data) {
        final SparkSession session = SparkContextBuilder.getSession();
        final String message = "数据集行数为： " + data.count();
        session.log().info(message);
        return data;
    }

    /**
     * Filters rows by comparing a date column against an optional start and/or end date.
     *
     * <p>Both sides of each comparison are rendered with {@code date_format} using
     * {@code dateFormat}, and the bounds are inclusive. NOTE(review): the comparison is therefore
     * made on formatted text; presumably this only orders correctly for formats that sort
     * chronologically (such as yyyy-MM-dd) - confirm with callers.
     *
     * @param data       input dataset
     * @param dateColumn name of the column to compare
     * @param startDate  lower bound; ignored when blank
     * @param endDate    upper bound; ignored when blank
     * @param dateFormat pattern applied to both the column and the bounds
     * @return the filtered dataset, or {@code data} itself when both bounds are blank
     * @throws NullPointerException if {@code data} is null or {@code dateColumn} /
     *                              {@code dateFormat} is blank
     */
    @InsightComponent(name = "时间过滤", description = "时间过滤")
    public static  Dataset filterDate(
            @InsightComponentArg(externalInput = true, name = "数据集", description = "数据集") Dataset data,
            @InsightComponentArg(name = "时间列", description = "用于比较的时间列") String dateColumn,
            @InsightComponentArg(name = "起始时间", description = "起始时间") String startDate,
            @InsightComponentArg(name = "终止时间", description = "终止时间") String endDate,
            @InsightComponentArg(name = "时间格式", description = "时间格式，例如yyyy-MM-dd") String dateFormat) {
        if (data == null) {
            throw new NullPointerException("数据集为空");
        }
        if (checkNull(dateColumn)) {
            throw new NullPointerException("时间列为空");
        }
        if (checkNull(dateFormat)) {
            throw new NullPointerException("时间格式为空");
        }

        final boolean hasStart = !checkNull(startDate);
        final boolean hasEnd = !checkNull(endDate);
        if (!hasStart && !hasEnd) {
            // No bound supplied: nothing to filter.
            return data;
        }

        // Build each bound separately and AND them together when both are present.
        Column condition = null;
        if (hasStart) {
            condition = date_format(data.col(dateColumn), dateFormat)
                    .geq(date_format(lit(startDate), dateFormat));
        }
        if (hasEnd) {
            final Column notAfterEnd = date_format(data.col(dateColumn), dateFormat)
                    .leq(date_format(lit(endDate), dateFormat));
            condition = (condition == null) ? notAfterEnd : condition.and(notAfterEnd);
        }
        return data.filter(condition);
    }

    /**
     * Removes duplicate rows by delegating to {@code Dataset.distinct()}.
     *
     * @param data input dataset
     * @return the result of {@code data.distinct()}
     */
    @InsightComponent(name = "去重", description = "去重")
    public static  Dataset distinct(
            @InsightComponentArg(externalInput = true, name = "数据集", description = "数据集") Dataset data) {
        final Dataset deduplicated = data.distinct();
        return deduplicated;
    }

    /**
     * Tells whether a string is {@code null} or contains only whitespace.
     *
     * @param s value to test
     * @return {@code true} when {@code s} is null or empty after trimming
     */
    private static boolean checkNull(String s) {
        if (s == null) {
            return true;
        }
        return s.trim().isEmpty();
    }


    /**
     * Sets the Spark checkpoint directory to {@code path + "/checkpoint"} and checkpoints the
     * dataset.
     *
     * @param data input dataset
     * @param path base folder under which the "checkpoint" directory is placed
     * @return the result of {@code data.checkpoint()}
     */
    @InsightComponent(name = "checkpoint", description = "checkpoint")
    public static  Dataset checkpoint(
            @InsightComponentArg(externalInput = true, name = "数据集", description = "数据集") Dataset data,
            @InsightComponentArg(name = "路径", description = "路径", defaultValue = "${MISC_FOLDER}") String path) {
        final String checkpointDir = path + "/checkpoint";
        SparkContextBuilder.getSession().sparkContext().setCheckpointDir(checkpointDir);
        return data.checkpoint();
    }

}