
com.datastax.data.prepare.spark.dataset.ReplaceOperator

package com.datastax.data.prepare.spark.dataset;

import com.alibaba.fastjson.JSONArray;
import com.alibaba.fastjson.JSONObject;
import com.datastax.insight.core.driver.SparkContextBuilder;
import com.datastax.insight.spec.Operator;
import com.datastax.insight.annonation.InsightComponent;
import com.datastax.insight.annonation.InsightComponentArg;
import com.datastax.data.prepare.spark.dataset.params.FilterSection;
import com.datastax.data.prepare.spark.dataset.params.ReplaceAttribute;
import com.datastax.data.prepare.util.Consts;
import com.datastax.data.prepare.util.CustomException;
import com.datastax.data.prepare.util.SharedMethods;
import org.apache.parquet.Strings;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.function.Function;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.RowFactory;
import org.apache.spark.sql.types.DataTypes;
import org.apache.spark.sql.types.StructField;
import org.apache.spark.sql.types.StructType;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

public class ReplaceOperator implements Operator {
    private static final Logger logger = LoggerFactory.getLogger(ReplaceOperator.class);

    @InsightComponent(name = "Replace", description = "Replace values in a dataset")
    public static <T> Dataset<T> replace(
            @InsightComponentArg(externalInput = true, name = "dataset", description = "the input dataset") Dataset<T> data,
            @InsightComponentArg(name = "parameters", description = "the replacement parameters") JSONObject object) {
        if(object.isEmpty()) {
            logger.info("Replace组件参数为空, 返回原数据集");
            return data;
        }
        if(data == null) {
            logger.info("Replace组件中的数据集为空, 返回空");
            return null;
        }
        String replaceType = object.getString("selector");
        JSONArray array = object.getJSONArray("selectorValue");

        List<ReplaceAttribute> replaceAttributes = new ArrayList<>();
        for(int i = 0; i < array.size(); i++) {
            // Assumption: each selectorValue element deserializes into a ReplaceAttribute (the exact JSON keys are not shown in this listing).
            replaceAttributes.add(array.getObject(i, ReplaceAttribute.class));
        }
        return replace(data, replaceType, replaceAttributes);
    }

    @SuppressWarnings("unchecked")
    protected static <T> Dataset<T> replace(Dataset<T> data, String replaceType, List<ReplaceAttribute> replaceAttributes) {
        Map<StructField, List<ReplaceAttribute>> map = new HashMap<>();
        StructField[] schema = data.schema().fields();
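        // Group each column matched by the attribute filter with every ReplaceAttribute that applies to it.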
        for(ReplaceAttribute replaceAttribute : replaceAttributes) {
            StructField[] fields = SharedMethods.attributeFilter(data, replaceAttribute.getAttributeSelector(), replaceAttribute.isInvertSelection(),
                    replaceAttribute.getAttribute(), replaceAttribute.getRegularExpression(), replaceAttribute.getValueType());
            if(fields == null) {
                logger.info("Replace组件中选择列的结果为空,跳过该参数");
                continue;
            }
            for(StructField field : fields) {
                if(map.containsKey(field)) {
                    map.get(field).add(replaceAttribute);
                } else {
                    List<ReplaceAttribute> list = new ArrayList<>();
                    list.add(replaceAttribute);
                    map.put(field, list);
                }
            }
        }

        Map totalSchema = new HashMap<>();
        SharedMethods.recordSchema(schema, totalSchema);
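        // Rebuild every row with string-typed values; the columns grouped above are the replacement targets.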
        JavaRDD<Row> javaRDD = data.toDF().javaRDD().map(new Function<Row, Row>() {
            @Override
            public Row call(Row row) throws Exception {
                String[] strings = new String[row.length()];
                for(int i = 0; i < row.length(); i++) {
                    Object original = row.get(i);
                    // Assumption: the original operator rewrites the values of the columns collected in `map`
                    // according to replaceType; those rules are not visible in this listing, so only the
                    // string conversion of each cell is reproduced here.
                    strings[i] = original == null ? null : String.valueOf(original);
                }
                return RowFactory.create(strings);
            }
        });
        List<StructField> structFields = new ArrayList<>();
        for(StructField field : schema) {
            structFields.add(DataTypes.createStructField(field.name(), DataTypes.StringType, true));
        }
        StructType structType = DataTypes.createStructType(structFields);
        return (Dataset<T>) SparkContextBuilder.getSession().createDataFrame(javaRDD, structType);
    }


}
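For context, a minimal usage sketch (assumptions: the SparkSession has already been initialized through SparkContextBuilder, and the JSON keys inside each selectorValue element mirror ReplaceAttribute's fields, which this listing does not show):

    // Hypothetical caller, not part of the artifact; "equals" is an assumed selector value.
    import com.alibaba.fastjson.JSONArray;
    import com.alibaba.fastjson.JSONObject;
    import com.datastax.data.prepare.spark.dataset.ReplaceOperator;
    import org.apache.spark.sql.Dataset;
    import org.apache.spark.sql.Row;

    class ReplaceOperatorExample {
        static Dataset<Row> run(Dataset<Row> input) {
            JSONObject params = new JSONObject();
            params.put("selector", "equals");             // assumed; the accepted selector values are not shown in this listing
            params.put("selectorValue", new JSONArray()); // one JSONObject per ReplaceAttribute to apply
            return ReplaceOperator.replace(input, params);
        }
    }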



