
// com.datastax.data.prepare.spark.dataset.ReplaceOperator Maven / Gradle / Ivy (page header left over from HTML extraction)
package com.datastax.data.prepare.spark.dataset;
import com.alibaba.fastjson.JSONArray;
import com.alibaba.fastjson.JSONObject;
import com.datastax.insight.core.driver.SparkContextBuilder;
import com.datastax.insight.spec.Operator;
import com.datastax.insight.annonation.InsightComponent;
import com.datastax.insight.annonation.InsightComponentArg;
import com.datastax.data.prepare.spark.dataset.params.FilterSection;
import com.datastax.data.prepare.spark.dataset.params.ReplaceAttribute;
import com.datastax.data.prepare.util.Consts;
import com.datastax.data.prepare.util.CustomException;
import com.datastax.data.prepare.util.SharedMethods;
import org.apache.parquet.Strings;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.function.Function;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.RowFactory;
import org.apache.spark.sql.types.DataTypes;
import org.apache.spark.sql.types.StructField;
import org.apache.spark.sql.types.StructType;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
// NOTE(review): This file appears to have been scraped from an HTML code-viewer page.
// Everything between '<' and '>' was stripped during extraction, so all generic type
// parameters are missing (e.g. "List replaceAttributes" was presumably
// "List<ReplaceAttribute> replaceAttributes"), and several whole spans of code were
// swallowed — see the GARBLED markers below. Recover the original source from the
// artifact jar before attempting to compile or modify this class.
//
// Purpose (as far as visible): a Spark dataset-preparation operator that replaces
// values in selected columns of a Dataset, driven by a fastjson JSONObject parameter.
public class ReplaceOperator implements Operator {
private static final Logger logger = LoggerFactory.getLogger(ReplaceOperator.class);
// Public component entry point. Annotation descriptions are Chinese UI strings
// ("Replace values in the dataset") and are part of runtime metadata — do not edit.
@InsightComponent(name = "Replace", description = "替换数据集中的值")
public static Dataset replace(
@InsightComponentArg(externalInput = true, name = "数据集", description = "数据集") Dataset data,
@InsightComponentArg(name = "参数", description = "参数") JSONObject object) {
// Empty parameter object: nothing to do, pass the dataset through unchanged.
// (Log message: "Replace parameters empty, returning original dataset".)
if(object.isEmpty()) {
logger.info("Replace组件参数为空, 返回原数据集");
return data;
}
// Null dataset: return null. (Log: "dataset is empty, returning null".)
// NOTE(review): this null check runs AFTER object.isEmpty(); if both callers can
// pass null data and a non-empty object this order is fine, but a null `object`
// would NPE on isEmpty() above — confirm against the caller's contract.
if(data == null) {
logger.info("Replace组件中的数据集为空, 返回空");
return null;
}
// "selector" chooses the replacement mode; "selectorValue" is an array of
// per-attribute replacement specs that is unpacked into ReplaceAttribute objects.
String replaceType = object.getString("selector");
JSONArray array = object.getJSONArray("selectorValue");
// GARBLED: originally presumably List<ReplaceAttribute>.
List replaceAttributes = new ArrayList<>();
// GARBLED: the loop body that parsed `array` into `replaceAttributes`, the end of
// this public method (presumably delegating to the private overload below), and
// the private method's signature — likely
// "private static <T> Dataset<T> replace(Dataset<T> data, String replaceType, List<ReplaceAttribute> replaceAttributes)"
// — were all fused into this single line by the extraction.
for(int i=0; i Dataset replace(Dataset data, String replaceType, List replaceAttributes) {
// Maps each selected column (StructField) to the list of replacement rules that
// apply to it. GARBLED: originally presumably Map<StructField, List<ReplaceAttribute>>.
Map> map = new HashMap<>();
StructField[] schema = data.schema().fields();
// Resolve each rule's attribute selector to concrete schema fields; rules that
// match no columns are skipped (log: "selected-column result empty, skipping").
for(ReplaceAttribute replaceAttribute : replaceAttributes) {
StructField[] fields = SharedMethods.attributeFilter(data, replaceAttribute.getAttributeSelector(), replaceAttribute.isInvertSelection(),
replaceAttribute.getAttribute(), replaceAttribute.getRegularExpression(), replaceAttribute.getValueType());
if(fields == null) {
logger.info("Replace组件中选择列的结果为空,跳过该参数");
continue;
}
// Accumulate rules per field; a field may be targeted by several rules.
for(StructField field : fields) {
if(map.containsKey(field)) {
map.get(field).add(replaceAttribute);
} else {
// GARBLED: originally presumably List<ReplaceAttribute>.
List list = new ArrayList<>();
list.add(replaceAttribute);
map.put(field, list);
}
}
}
// Index of the full schema, built by SharedMethods.recordSchema; exact key/value
// types were stripped by extraction (originally presumably Map<String, Integer>
// or similar — TODO confirm from the recovered source).
Map totalSchema = new HashMap<>();
SharedMethods.recordSchema(schema, totalSchema);
// GARBLED: originally presumably JavaRDD<Row> and new Function<Row, Row>().
JavaRDD javaRDD = data.toDF().javaRDD().map(new Function() {
@Override
public Row call(Row row) throws Exception {
String[] strings = new String[row.length()];
// GARBLED: the per-column replacement loop body, the close of this anonymous
// Function, the construction of `structType`, and the method's return were all
// collapsed into this line. The surviving tail shows the result is built with
// SparkContextBuilder.getSession().createDataFrame(javaRDD, structType) and cast
// to the generic Dataset type.
for(int i=0; i) SparkContextBuilder.getSession().createDataFrame(javaRDD, structType);
}
}
// © 2015 - 2025 Weber Informatics LLC | Privacy Policy (page footer left over from HTML extraction)