All downloads are free. Search and download functionality uses the official Maven repository.

com.datastax.data.prepare.spark.dataset.FillDataOperator Maven / Gradle / Ivy

package com.datastax.data.prepare.spark.dataset;

import com.alibaba.fastjson.JSON;
import com.alibaba.fastjson.JSONArray;
import com.alibaba.fastjson.JSONObject;
import com.datastax.insight.spec.Operator;
import com.datastax.insight.annonation.InsightComponent;
import com.datastax.insight.annonation.InsightComponentArg;
import com.datastax.data.prepare.spark.dataset.params.FillMissingValue;
import com.datastax.data.prepare.util.Consts;
import com.datastax.data.prepare.util.SharedMethods;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.functions;
import org.apache.spark.sql.types.DataTypes;
import org.apache.spark.sql.types.StructField;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.util.HashMap;
import java.util.List;
import java.util.Map;

public class FillDataOperator implements Operator {
    private static final Logger logger = LoggerFactory.getLogger(FillDataOperator.class);

    /**
     * Applies each fill rule to the dataset in turn, feeding the result of one
     * rule into the next.
     *
     * <p>When no rules are supplied, or {@code data.count()} is zero, an info
     * message is logged and {@code data} is returned untouched. The row count is
     * only evaluated when at least one rule is present.
     *
     * @param data              dataset to process
     * @param fillMissingValues fill rules, applied in the order given
     * @return the dataset produced by the last rule, or {@code data} itself when
     *         nothing was applied
     */
    static Dataset imputeMissingValues(Dataset data, FillMissingValue... fillMissingValues) {
        final boolean noRules = fillMissingValues.length == 0;
        if (noRules || data.count() == 0) {
            logger.info("Detail parameter of Vacancy is empty or Dataset is empty");
            return data;
        }

        Dataset current = data;
        for (int idx = 0; idx < fillMissingValues.length; idx++) {
            final FillMissingValue rule = fillMissingValues[idx];
            // Resolve the fields this rule targets against the dataset as it
            // stands after the previous rules.
            final StructField[] selected = SharedMethods.attributeFilter(
                    current,
                    rule.getAttributeSelector(),
                    rule.isInvertSelection(),
                    rule.getAttribute(),
                    rule.getRegularExpression(),
                    rule.getValueType());
            current = missingValuesFill(current, rule, selected);
        }
        return current;
    }
    // TODO(andy): unused
    /**
     * List-based convenience overload: copies the list contents into a
     * {@code FillMissingValue[]} and delegates to the varargs overload.
     *
     * @param data              dataset to process
     * @param fillMissingValues fill rules, applied in list order
     * @return whatever the varargs overload returns for the same rules
     */
    protected static Dataset imputeMissingValues(Dataset data, List fillMissingValues) {
        // Exactly-sized target array, so toArray fills it in place.
        FillMissingValue[] rules = new FillMissingValue[fillMissingValues.size()];
        fillMissingValues.toArray(rules);
        return imputeMissingValues(data, rules);
    }
    // TODO(andy): unused
    /**
     * JSON-string convenience overload: parses {@code json} as a JSON array and
     * delegates to the {@code JSONArray} overload.
     *
     * <p>The dataset is returned unchanged when {@code json} is {@code null},
     * the empty string, or text for which the parser produces no array.
     *
     * @param data dataset to process
     * @param json JSON array text describing the fill rules; may be {@code null}
     * @return the processed dataset, or {@code data} itself when there is
     *         nothing to apply
     */
    protected static Dataset imputeMissingValues(Dataset data, String json) {
        if (json == null || json.isEmpty()) {
            return data;
        }
        JSONArray array = JSON.parseArray(json);
        if (array == null) {
            // fastjson's parseArray can hand back null (e.g. for the JSON
            // literal "null" or whitespace-only text); without this guard the
            // JSONArray overload would fail on array.isEmpty().
            return data;
        }
        return imputeMissingValues(data, array);
    }

    @InsightComponent(name = "空缺值处理", type = "com.datastax.insight.dataprprocess.imputeMissingValues", description = "空缺值处理", order = 500701)
    public static  Dataset imputeMissingValues(
            @InsightComponentArg(externalInput = true, name = "data", description = "数据集") Dataset data,
            @InsightComponentArg(name = "参数", description = "填充空缺值详细参数") JSONArray array) {
        if(array.isEmpty()) {
            return data;
        }
        FillMissingValue fillMissingValue = new FillMissingValue();
        for(int i=0; i Dataset missingValuesFill(Dataset data, FillMissingValue fillMissingValue, StructField[] fields) {
        if(fields == null) {return data;}
        Map map = new HashMap<>();
        Row[] rows;
        if(Consts.DROPEMPTY.equals(fillMissingValue.getFillData())) {
            String[] temp = new String[fields.length];
            for(int i=0; i) data.na().drop(temp);
        }
        if(Consts.MINIMUM.equals(fillMissingValue.getFillData())) {
            for (StructField field : fields) {
                if (field.dataType() != DataTypes.CalendarIntervalType & field.dataType() != DataTypes.NullType) {
                    rows = (Row[]) data.agg(functions.min(data.col(field.name()))).collect();
                    if (rows[0].get(0) != null) {
                        map.put(field.name(), rows[0].get(0).toString());
                    }
                }
            }
        }
        if(Consts.MAXIMUM.equals(fillMissingValue.getFillData())) {
            for (StructField field : fields) {
                if (field.dataType() != DataTypes.CalendarIntervalType & field.dataType() != DataTypes.NullType) {
                    rows = (Row[]) data.agg(functions.max(data.col(field.name()))).collect();
                    if (rows[0].get(0) != null) {
                        map.put(field.name(), rows[0].get(0).toString());
                    }
                }
            }
        }
        if(Consts.ZERO.equals(fillMissingValue.getFillData())) {
            for (StructField field : fields) {
                if (field.dataType() != DataTypes.NullType & field.dataType() != DataTypes.CalendarIntervalType) {
                    map.put(field.name(), "0");
                }
            }
        }
        if(Consts.VALUE.equals(fillMissingValue.getFillData())) {
            for (StructField field : fields) {
                if (field.dataType() != DataTypes.NullType) {
                    if (fillMissingValue.getFillDataValue() != null && !"".equals(fillMissingValue.getFillDataValue())) {
                        map.put(field.name(), fillMissingValue.getFillDataValue());
                    }
                }
            }
        }
        if(Consts.AVERAGE.equals(fillMissingValue.getFillData())) {
            for (StructField field : fields) {
                if (field.dataType() != DataTypes.CalendarIntervalType & field.dataType() != DataTypes.DateType &
                        field.dataType() != DataTypes.TimestampType & field.dataType() != DataTypes.BooleanType &
                        field.dataType() != DataTypes.StringType & field.dataType() != DataTypes.NullType &
                        field.dataType() != DataTypes.BinaryType) {
                    rows = (Row[]) data.agg(functions.avg(data.col(field.name()))).collect();
                    if (rows[0].get(0) != null) {
                        map.put(field.name(), rows[0].get(0).toString());
                    }
                }
            }
        }
        if(!map.isEmpty()) {
            data = (Dataset) data.na().fill(map);
        }
        map.clear();
        return data;
    }

}




© 2015 - 2025 Weber Informatics LLC | Privacy Policy