// com.datastax.data.prepare.spark.dataset.FillDataOperator Maven / Gradle / Ivy
package com.datastax.data.prepare.spark.dataset;
import com.alibaba.fastjson.JSON;
import com.alibaba.fastjson.JSONArray;
import com.alibaba.fastjson.JSONObject;
import com.datastax.insight.spec.Operator;
import com.datastax.insight.annonation.InsightComponent;
import com.datastax.insight.annonation.InsightComponentArg;
import com.datastax.data.prepare.spark.dataset.params.FillMissingValue;
import com.datastax.data.prepare.util.Consts;
import com.datastax.data.prepare.util.SharedMethods;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.functions;
import org.apache.spark.sql.types.DataTypes;
import org.apache.spark.sql.types.StructField;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
public class FillDataOperator implements Operator {
private static final Logger logger = LoggerFactory.getLogger(FillDataOperator.class);
/**
 * Applies each fill rule, in order, to the given dataset.
 *
 * <p>For every rule the target columns are resolved with
 * {@code SharedMethods.attributeFilter} and the rule is applied through
 * {@code missingValuesFill}; the result of one rule is the input of the next.
 *
 * @param data the dataset to process
 * @param fillMissingValues the fill rules to apply; may be empty
 * @return the processed dataset, or {@code data} itself when there are no rules or the
 *         dataset has no rows
 */
static Dataset imputeMissingValues(Dataset data, FillMissingValue... fillMissingValues) {
    // Fetching at most one row is enough to detect an empty dataset; count() would scan
    // every row just to compare the total with zero.
    if (fillMissingValues == null || fillMissingValues.length == 0 || data.takeAsList(1).isEmpty()) {
        logger.info("Detail parameter of Vacancy is empty or Dataset is empty");
        return data;
    }
    for (FillMissingValue rule : fillMissingValues) {
        data = missingValuesFill(
                data,
                rule,
                SharedMethods.attributeFilter(
                        data,
                        rule.getAttributeSelector(),
                        rule.isInvertSelection(),
                        rule.getAttribute(),
                        rule.getRegularExpression(),
                        rule.getValueType()));
    }
    return data;
}
/**
 * List-based convenience overload: copies the rules into an array and delegates to the
 * varargs variant.
 *
 * <p>TODO (andy): currently unused.
 *
 * @param data the dataset to process
 * @param fillMissingValues the fill rules; every element must be a {@code FillMissingValue}
 * @return the processed dataset, or {@code data} unchanged when the list is {@code null}
 */
protected static Dataset imputeMissingValues(Dataset data, List fillMissingValues) {
    // Same "nothing to do" behaviour as the String overload has for a null argument.
    if (fillMissingValues == null) {
        return data;
    }
    // On a raw List, toArray(T[]) is typed Object[]; the explicit cast keeps this call bound
    // to the varargs overload. The runtime array type is already FillMissingValue[].
    FillMissingValue[] rules = (FillMissingValue[]) fillMissingValues.toArray(new FillMissingValue[0]);
    return imputeMissingValues(data, rules);
}
/**
 * JSON-string convenience overload: parses the text as a JSON array and delegates to the
 * {@code JSONArray} variant.
 *
 * <p>TODO (andy): currently unused.
 *
 * @param data the dataset to process
 * @param json JSON array text describing the fill rules; may be {@code null} or blank
 * @return the processed dataset, or {@code data} unchanged when {@code json} is
 *         {@code null}, blank, or does not parse to an array instance
 */
protected static Dataset imputeMissingValues(Dataset data, String json) {
    if (json == null || json.trim().isEmpty()) {
        return data;
    }
    JSONArray parsed = JSON.parseArray(json);
    // NOTE(review): parseArray is understood to return null for input such as the JSON
    // literal "null"; the JSONArray overload dereferences its argument, so guard here.
    if (parsed == null) {
        return data;
    }
    return imputeMissingValues(data, parsed);
}
/**
 * Component entry point (declared through {@code @InsightComponent}) that receives the fill
 * parameters as a JSON array.
 *
 * <p>Returns {@code data} untouched when {@code array} is empty. Otherwise a fresh
 * {@code FillMissingValue} is created and an index loop begins.
 *
 * <p>NOTE(review): this copy of the file is damaged. The rest of this method and the
 * modifiers of {@code missingValuesFill} are missing where the two are fused on one line,
 * and the loop in the DROPEMPTY branch is truncated. It looks as if every span between a
 * {@code <} and the next {@code >} was stripped, generic type arguments included. Restore
 * these parts from the original source before relying on this code.
 */
@InsightComponent(name = "空缺值处理", type = "com.datastax.insight.dataprprocess.imputeMissingValues", description = "空缺值处理", order = 500701)
public static Dataset imputeMissingValues(
@InsightComponentArg(externalInput = true, name = "data", description = "数据集") Dataset data,
@InsightComponentArg(name = "参数", description = "填充空缺值详细参数") JSONArray array) {
// Nothing to apply when no parameters were supplied.
if(array.isEmpty()) {
return data;
}
FillMissingValue fillMissingValue = new FillMissingValue();
// NOTE(review): the text after "for(int i=0; i" up to "Dataset missingValuesFill(" is missing
// here; the loop bounds/body, the end of imputeMissingValues and the declaration modifiers of
// missingValuesFill are not visible.
//
// missingValuesFill(data, fillMissingValue, fields): handles the selected columns `fields`
// according to the strategy returned by fillMissingValue.getFillData(). The strategies are
// tested one after another with independent `if` statements. Replacement values are
// collected in a map keyed by column name and applied with a single data.na().fill(map).
for(int i=0; i Dataset missingValuesFill(Dataset data, FillMissingValue fillMissingValue, StructField[] fields) {
// No column selection available: return the input as is.
if(fields == null) {return data;}
// Column name -> replacement value; filled by the strategy branches below.
Map map = new HashMap<>();
// Holds the collected result of each per-column aggregation.
Row[] rows;
// DROPEMPTY strategy.
if(Consts.DROPEMPTY.equals(fillMissingValue.getFillData())) {
// One slot per selected column.
String[] temp = new String[fields.length];
// NOTE(review): the loop header/body on the next line is truncated in this copy. As
// visible, the result of data.na().drop(temp) is not assigned to anything; confirm
// against the original source.
for(int i=0; i) data.na().drop(temp);
}
// MINIMUM strategy: use each column's minimum as its replacement value.
if(Consts.MINIMUM.equals(fillMissingValue.getFillData())) {
for (StructField field : fields) {
// Skips CalendarIntervalType and NullType columns. `&` is the non-short-circuit AND, so
// both reference comparisons are always evaluated.
if (field.dataType() != DataTypes.CalendarIntervalType & field.dataType() != DataTypes.NullType) {
rows = (Row[]) data.agg(functions.min(data.col(field.name()))).collect();
// A null aggregate leaves this column without a replacement value.
if (rows[0].get(0) != null) {
// The value is stored in its string form.
map.put(field.name(), rows[0].get(0).toString());
}
}
}
}
// MAXIMUM strategy: same as MINIMUM but with the column maximum.
if(Consts.MAXIMUM.equals(fillMissingValue.getFillData())) {
for (StructField field : fields) {
if (field.dataType() != DataTypes.CalendarIntervalType & field.dataType() != DataTypes.NullType) {
rows = (Row[]) data.agg(functions.max(data.col(field.name()))).collect();
if (rows[0].get(0) != null) {
map.put(field.name(), rows[0].get(0).toString());
}
}
}
}
// ZERO strategy: the replacement is the string "0" for every column that is neither NullType
// nor CalendarIntervalType.
if(Consts.ZERO.equals(fillMissingValue.getFillData())) {
for (StructField field : fields) {
if (field.dataType() != DataTypes.NullType & field.dataType() != DataTypes.CalendarIntervalType) {
map.put(field.name(), "0");
}
}
}
// VALUE strategy: the replacement is fillMissingValue.getFillDataValue(); a null or empty
// value adds nothing to the map. Only NullType columns are excluded.
if(Consts.VALUE.equals(fillMissingValue.getFillData())) {
for (StructField field : fields) {
if (field.dataType() != DataTypes.NullType) {
if (fillMissingValue.getFillDataValue() != null && !"".equals(fillMissingValue.getFillDataValue())) {
map.put(field.name(), fillMissingValue.getFillDataValue());
}
}
}
}
// AVERAGE strategy: use each column's average, skipping the interval, date, timestamp,
// boolean, string, null and binary types listed in the condition.
if(Consts.AVERAGE.equals(fillMissingValue.getFillData())) {
for (StructField field : fields) {
if (field.dataType() != DataTypes.CalendarIntervalType & field.dataType() != DataTypes.DateType &
field.dataType() != DataTypes.TimestampType & field.dataType() != DataTypes.BooleanType &
field.dataType() != DataTypes.StringType & field.dataType() != DataTypes.NullType &
field.dataType() != DataTypes.BinaryType) {
rows = (Row[]) data.agg(functions.avg(data.col(field.name()))).collect();
if (rows[0].get(0) != null) {
map.put(field.name(), rows[0].get(0).toString());
}
}
}
}
// Apply all collected replacements in one fill call; an empty map means nothing to fill.
if(!map.isEmpty()) {
data = (Dataset) data.na().fill(map);
}
map.clear();
return data;
}
}