All downloads are free. The search and download functionality uses the official Maven repository.

com.datastax.insight.ml.spark.data.dataset.DataSetAction Maven / Gradle / Ivy

package com.datastax.insight.ml.spark.data.dataset;

import com.datastax.insight.spec.Operator;
import com.datastax.insight.core.Consts;
import com.google.common.base.Strings;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.rdd.RDD;
import org.apache.spark.sql.AnalysisException;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.RelationalGroupedDataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.types.StructType;
import org.apache.spark.storage.StorageLevel;

import javax.annotation.Nullable;
import java.util.Arrays;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.stream.Collectors;

/**
 * Static wrappers around the action-style methods of a Spark {@link Dataset}
 * (collect, show, describe, head, first, take, count).
 *
 * <p>Every method simply delegates to the method of the same name on the supplied
 * dataset; optional arguments are modelled as nullable boxed values and select the
 * matching Spark overload.
 */
public class DataSetAction implements Operator {

    /** Row count printed by Spark's no-argument {@code show()}; used when only {@code truncate} is given. */
    private static final int DEFAULT_NUM_ROWS = 20;

    /**
     * Delegates to {@code data.collect()}.
     *
     * @param data the dataset to collect
     * @param <T>  element type of the dataset
     * @return the array returned by {@code Dataset.collect()}
     */
    @Actional
    @SuppressWarnings("unchecked") // Dataset.collect() surfaces as Object in Java; the element type is T.
    public static <T> T[] collect(Dataset<T> data) {
        return (T[]) data.collect();
    }

    /**
     * Delegates to {@code data.collectAsList()}.
     *
     * @param data the dataset to collect
     * @param <T>  element type of the dataset
     * @return the list returned by {@code Dataset.collectAsList()}
     */
    @Actional
    public static <T> List<T> collectAsList(Dataset<T> data) {
        return data.collectAsList();
    }

    /**
     * Prints the dataset through one of the {@code Dataset.show} overloads.
     *
     * <ul>
     *   <li>both arguments {@code null}: {@code show()}</li>
     *   <li>only {@code numRows} given: {@code show(numRows)}</li>
     *   <li>only {@code truncate} given: {@code show(20, truncate)} so the requested
     *       truncation is honoured instead of being silently dropped</li>
     *   <li>both given: {@code show(numRows, truncate)}</li>
     * </ul>
     *
     * @param data     the dataset to print
     * @param numRows  number of rows to print, or {@code null} for the default
     * @param truncate value forwarded as the {@code truncate} argument, or {@code null} for the default
     */
    @Actional
    public static void show(Dataset<?> data, Integer numRows, Integer truncate) {
        if (numRows == null && truncate == null) {
            data.show();
        } else if (truncate == null) {
            data.show(numRows);
        } else {
            int rows = numRows == null ? DEFAULT_NUM_ROWS : numRows;
            data.show(rows, truncate);
        }
    }

    /**
     * Delegates to {@code data.describe(...)}.
     *
     * @param data the dataset to describe
     * @param cols column names joined with {@code Consts.DELIMITER} (interpreted as a regex by
     *             {@link String#split(String)}); when {@code null} or empty the no-argument
     *             {@code describe()} is used
     * @return the dataset returned by {@code Dataset.describe}
     */
    @Actional
    public static Dataset<Row> describe(Dataset<?> data, String cols) {
        if (Strings.isNullOrEmpty(cols)) {
            return data.describe();
        }
        String[] columns = cols.split(Consts.DELIMITER);
        return data.describe(columns);
    }

    /**
     * Delegates to {@code data.head(n)}.
     *
     * @param data the dataset to read from
     * @param n    value forwarded to {@code Dataset.head(int)}
     * @param <T>  element type of the dataset
     * @return the array returned by {@code Dataset.head(int)}
     */
    @Actional
    @SuppressWarnings("unchecked") // Dataset.head(int) surfaces as Object in Java; the element type is T.
    public static <T> T[] head(Dataset<T> data, int n) {
        return (T[]) data.head(n);
    }

    /**
     * Delegates to {@code data.first()}.
     *
     * @param data the dataset to read from
     * @param <T>  element type of the dataset
     * @return the value returned by {@code Dataset.first()}
     */
    @Actional
    public static <T> T first(Dataset<T> data) {
        return data.first();
    }

    /**
     * Delegates to {@code data.take(n)}.
     *
     * @param data the dataset to read from
     * @param n    value forwarded to {@code Dataset.take(int)}
     * @param <T>  element type of the dataset
     * @return the array returned by {@code Dataset.take(int)}
     */
    @Actional
    @SuppressWarnings("unchecked") // Dataset.take(int) surfaces as Object in Java; the element type is T.
    public static <T> T[] take(Dataset<T> data, int n) {
        return (T[]) data.take(n);
    }

    /**
     * Delegates to {@code data.count()}.
     *
     * @param data the dataset to count
     * @return the value returned by {@code Dataset.count()}
     */
    @Actional
    public static long count(Dataset<?> data) {
        return data.count();
    }
}


class DatasetBasic implements Operator {

    @Transformal
    public static Dataset toDF(Dataset data, @Nullable String cols) {
        if(Strings.isNullOrEmpty(cols)) {
            return data.toDF();
        } else {
            String[] columns = cols.split(Consts.DELIMITER);
            return data.toDF(columns);
        }
    }

    public StructType schema(Dataset data) {
        return data.schema();
    }

    public static void printSchema(Dataset data) {
        data.printSchema();
    }

    public static void explain(Dataset data, Boolean extended) {
        if (extended == null) {
            data.explain();
        } else {
            data.explain(extended);
        }
    }

    public static Map dtypes(Dataset data) {
        return Arrays.stream(data.dtypes()).collect(HashMap::new,
                (mapping, info)->mapping.put(info._1.toString(), info._2.toString()),
                HashMap::putAll);
    }

    public static String[] columns(Dataset data) {
        return data.columns();
    }

    public static boolean isLocal(Dataset data) {
        return data.isLocal();
    }

    public static  Dataset checkpoint(Dataset data, Boolean eager) {
        return eager == null ? data.checkpoint() : data.checkpoint(eager.booleanValue());
    }

    public static Dataset persist(Dataset data, String storageLevel) {
        return Strings.isNullOrEmpty(storageLevel) ? data.persist() : data.persist(StorageLevel.fromString(storageLevel));
    }

    public static Dataset unpersist(Dataset data, Boolean blocking) {
        return blocking == null ? data.unpersist() : data.unpersist(blocking);
    }

    public static  RDD rdd(Dataset data) {
        return data.rdd();
    }

    public static  JavaRDD javaRDD(Dataset data) {
        return data.javaRDD();
    }

    public static void createTempViewCommand(Dataset data, String viewName, Boolean replace, Boolean global) throws AnalysisException {
        if(replace == null || replace.booleanValue() == false) {
            if(global == null || global.booleanValue() == false) {
                data.createTempView(viewName);
            } else {
                data.createGlobalTempView(viewName);
            }
        } else {
            if(global == null || global.booleanValue() == false) {
                data.createOrReplaceTempView(viewName);
            } else {
//                data.createOrReplaceGlobalTempView(viewName);
            }
        }
    }

    public static  Dataset persist(Dataset data) {
        return data.toJSON();
    }

    public static  String[] inputFiles(Dataset data) {
        return data.inputFiles();
    }
}

/**
 * Static wrappers around the relational / untyped {@link Dataset} transformations
 * (NA handling, statistics, joins, sorting, projection, grouping, set operations, sampling).
 *
 * <p>Wherever a method takes several column names they are passed as a single string joined
 * with {@code Consts.DELIMITER}; that constant is handed to {@link String#split(String)} and is
 * therefore interpreted as a regular expression.
 */
class DatasetUntypedRel {

    /**
     * Separates the column from the aggregate function inside one {@link #agg} item.
     * NOTE(review): assumes {@code Consts.DELIMITER} does not itself match this character — confirm.
     */
    private static final char AGG_PAIR_SEPARATOR = ':';

    /** Splits a delimiter-joined column list into its parts. */
    private static String[] splitColumns(String cols) {
        return cols.split(Consts.DELIMITER);
    }

    /** Returns every element after the first; empty when there is only one element. */
    private static String[] tail(String[] columns) {
        return Arrays.copyOfRange(columns, 1, columns.length);
    }

    /**
     * Delegates to {@code data.na().drop()}.
     *
     * @param data the dataset to clean
     * @return the dataset returned by {@code DataFrameNaFunctions.drop()}
     */
    public static Dataset<Row> dropNA(Dataset<?> data) {
        return data.na().drop();
    }

    /**
     * Delegates to one of the {@code data.na().fill(...)} overloads, chosen by {@code type}.
     *
     * @param data  the dataset to fill
     * @param type  {@code "long"} / {@code "java.lang.Long"} parses {@code value} with
     *              {@link Long#parseLong(String)}; {@code "double"} / {@code "java.lang.Double"} parses
     *              it with {@link Double#parseDouble(String)}; anything else (including {@code null})
     *              passes {@code value} through as a string
     * @param value the replacement value in textual form
     * @return the dataset returned by {@code DataFrameNaFunctions.fill}
     * @throws NumberFormatException if {@code value} cannot be parsed as the requested numeric type
     */
    public static Dataset<Row> fillNA(Dataset<?> data, String type, String value) {
        // Constant-first equals keeps a null type from throwing; it simply selects the string fill.
        if (long.class.getTypeName().equals(type) || Long.class.getTypeName().equals(type)) {
            return data.na().fill(Long.parseLong(value));
        }
        if (double.class.getTypeName().equals(type) || Double.class.getTypeName().equals(type)) {
            return data.na().fill(Double.parseDouble(value));
        }
        return data.na().fill(value);
    }

    /**
     * Delegates to {@code data.stat().corr(...)}.
     *
     * @param data   the dataset to analyse
     * @param col1   first column name
     * @param col2   second column name
     * @param method forwarded as the third argument; {@code null} or empty uses the two-argument overload
     * @return the value returned by {@code DataFrameStatFunctions.corr}
     */
    public static double corr(Dataset<?> data, String col1, String col2, String method) {
        return Strings.isNullOrEmpty(method)
                ? data.stat().corr(col1, col2)
                : data.stat().corr(col1, col2, method);
    }

    /**
     * Delegates to {@code data.stat().cov(col1, col2)}.
     *
     * @param data the dataset to analyse
     * @param col1 first column name
     * @param col2 second column name
     * @return the value returned by {@code DataFrameStatFunctions.cov}
     */
    public static double cov(Dataset<?> data, String col1, String col2) {
        return data.stat().cov(col1, col2);
    }

    /**
     * Delegates to {@code data.stat().crosstab(col1, col2)}.
     *
     * @param data the dataset to analyse
     * @param col1 first column name
     * @param col2 second column name
     * @return the dataset returned by {@code DataFrameStatFunctions.crosstab}
     */
    public static Dataset<Row> crosstab(Dataset<?> data, String col1, String col2) {
        return data.stat().crosstab(col1, col2);
    }

    /**
     * Delegates to {@code data.stat().freqItems(...)}.
     *
     * @param data    the dataset to analyse
     * @param cols    delimiter-joined column names
     * @param support forwarded as the second argument; {@code null} uses the one-argument overload
     * @return the dataset returned by {@code DataFrameStatFunctions.freqItems}
     */
    public static Dataset<Row> freqItems(Dataset<?> data, String cols, Double support) {
        String[] columns = splitColumns(cols);
        return support == null ? data.stat().freqItems(columns) : data.stat().freqItems(columns, support);
    }

    /**
     * Joins two datasets on a list of shared column names.
     *
     * @param data     left side of the join
     * @param right    right side of the join
     * @param cols     delimiter-joined names of the join columns
     * @param joinType forwarded to {@code Dataset.join}; {@code null} or empty uses the overload
     *                 without a join type
     * @return the dataset returned by {@code Dataset.join}
     */
    public static Dataset<Row> join(Dataset<?> data, Dataset<?> right, String cols, String joinType) {
        List<String> columns = Arrays.stream(splitColumns(cols)).collect(Collectors.toList());
        scala.collection.Seq<String> usingColumns =
                scala.collection.JavaConversions.asScalaBuffer(columns).toSeq();
        return Strings.isNullOrEmpty(joinType)
                ? data.join(right, usingColumns)
                : data.join(right, usingColumns, joinType);
    }

    /**
     * Delegates to {@code data.crossJoin(right)}.
     *
     * @param data  left side of the join
     * @param right right side of the join
     * @return the dataset returned by {@code Dataset.crossJoin}
     */
    public static Dataset<Row> crossJoin(Dataset<?> data, Dataset<?> right) {
        return data.crossJoin(right);
    }

    /**
     * Delegates to {@code data.sortWithinPartitions(first, rest...)}.
     *
     * <p>The column names are passed through the String-based overload. The previous code handed a
     * sequence of strings to the single-argument overload, which expects {@code Column} objects.
     *
     * @param data the dataset to sort
     * @param cols delimiter-joined column names
     * @param <T>  element type of the dataset
     * @return the dataset returned by {@code Dataset.sortWithinPartitions}
     */
    public static <T> Dataset<T> sortWithinPartitions(Dataset<T> data, String cols) {
        String[] columns = splitColumns(cols);
        return data.sortWithinPartitions(columns[0], tail(columns));
    }

    /**
     * Delegates to {@code data.sort(first, rest...)}.
     *
     * @param data the dataset to sort
     * @param cols delimiter-joined column names
     * @param <T>  element type of the dataset
     * @return the dataset returned by {@code Dataset.sort}
     */
    public static <T> Dataset<T> sort(Dataset<T> data, String cols) {
        String[] columns = splitColumns(cols);
        return data.sort(columns[0], tail(columns));
    }

    /**
     * Delegates to {@code data.orderBy(first, rest...)}.
     *
     * @param data the dataset to sort
     * @param cols delimiter-joined column names
     * @param <T>  element type of the dataset
     * @return the dataset returned by {@code Dataset.orderBy}
     */
    public static <T> Dataset<T> orderBy(Dataset<T> data, String cols) {
        String[] columns = splitColumns(cols);
        return data.orderBy(columns[0], tail(columns));
    }

    /**
     * Delegates to {@code data.as(alias)}.
     *
     * @param data  the dataset to alias
     * @param alias the alias name
     * @param <T>   element type of the dataset
     * @return the dataset returned by {@code Dataset.as(String)}
     */
    public static <T> Dataset<T> as(Dataset<T> data, String alias) {
        return data.as(alias);
    }

    /**
     * Delegates to {@code data.select(first, rest...)}.
     *
     * @param data the dataset to project
     * @param cols delimiter-joined column names
     * @return the dataset returned by {@code Dataset.select}
     */
    public static Dataset<Row> select(Dataset<?> data, String cols) {
        String[] columns = splitColumns(cols);
        return data.select(columns[0], tail(columns));
    }

    /**
     * Delegates to {@code data.selectExpr(...)}.
     *
     * @param data the dataset to project
     * @param cols delimiter-joined expressions
     * @return the dataset returned by {@code Dataset.selectExpr}
     */
    public static Dataset<Row> selectExpr(Dataset<?> data, String cols) {
        return data.selectExpr(splitColumns(cols));
    }

    /**
     * Delegates to {@code data.filter(conditionExpr)}.
     *
     * @param data          the dataset to filter
     * @param conditionExpr the condition expression string
     * @param <T>           element type of the dataset
     * @return the dataset returned by {@code Dataset.filter(String)}
     */
    public static <T> Dataset<T> filter(Dataset<T> data, String conditionExpr) {
        return data.filter(conditionExpr);
    }

    /**
     * Delegates to {@code data.where(conditionExpr)}.
     *
     * @param data          the dataset to filter
     * @param conditionExpr the condition expression string
     * @param <T>           element type of the dataset
     * @return the dataset returned by {@code Dataset.where(String)}
     */
    public static <T> Dataset<T> where(Dataset<T> data, String conditionExpr) {
        return data.where(conditionExpr);
    }

    /**
     * Delegates to {@code data.groupBy(first, rest...)}.
     *
     * @param data the dataset to group
     * @param cols delimiter-joined column names
     * @return the grouped dataset returned by {@code Dataset.groupBy}
     */
    public static RelationalGroupedDataset groupBy(Dataset<?> data, String cols) {
        String[] columns = splitColumns(cols);
        return data.groupBy(columns[0], tail(columns));
    }

    /**
     * Delegates to {@code data.rollup(first, rest...)}.
     *
     * @param data the dataset to group
     * @param cols delimiter-joined column names
     * @return the grouped dataset returned by {@code Dataset.rollup}
     */
    public static RelationalGroupedDataset rollup(Dataset<?> data, String cols) {
        String[] columns = splitColumns(cols);
        return data.rollup(columns[0], tail(columns));
    }

    /**
     * Delegates to {@code data.cube(first, rest...)}.
     *
     * @param data the dataset to group
     * @param cols delimiter-joined column names
     * @return the grouped dataset returned by {@code Dataset.cube}
     */
    public static RelationalGroupedDataset cube(Dataset<?> data, String cols) {
        String[] columns = splitColumns(cols);
        return data.cube(columns[0], tail(columns));
    }

    /**
     * Aggregates over the whole dataset using a column-to-function map.
     *
     * <p>{@code aggExpr} is a delimiter-joined list of {@code column:function} items, for example
     * {@code "age:max"}. The previous implementation split each item a second time with the same
     * delimiter, which can never yield two parts, so every call failed with
     * {@link ArrayIndexOutOfBoundsException}.
     *
     * @param data    the dataset to aggregate
     * @param aggExpr delimiter-joined {@code column:function} items
     * @return the dataset returned by {@code Dataset.agg(Map)}
     * @throws IllegalArgumentException if an item is not of the form {@code column:function}
     */
    public static Dataset<Row> agg(Dataset<?> data, String aggExpr) {
        Map<String, String> exprs = new HashMap<>();
        for (String item : splitColumns(aggExpr)) {
            // lastIndexOf: function names never contain the separator, column names conceivably could.
            int sep = item.lastIndexOf(AGG_PAIR_SEPARATOR);
            if (sep <= 0 || sep == item.length() - 1) {
                throw new IllegalArgumentException(
                        "Malformed aggregate expression '" + item + "': expected column"
                                + AGG_PAIR_SEPARATOR + "function");
            }
            exprs.put(item.substring(0, sep).trim(), item.substring(sep + 1).trim());
        }
        return data.agg(exprs);
    }

    /**
     * Delegates to {@code data.limit(n)}.
     *
     * @param data the dataset to limit
     * @param n    value forwarded to {@code Dataset.limit}
     * @param <T>  element type of the dataset
     * @return the dataset returned by {@code Dataset.limit}
     */
    public static <T> Dataset<T> limit(Dataset<T> data, int n) {
        return data.limit(n);
    }

    /**
     * Delegates to {@code data.union(right)}.
     *
     * @param data  first dataset
     * @param right second dataset
     * @param <T>   element type of both datasets
     * @return the dataset returned by {@code Dataset.union}
     */
    public static <T> Dataset<T> union(Dataset<T> data, Dataset<T> right) {
        return data.union(right);
    }

    /**
     * Delegates to {@code data.intersect(right)}.
     *
     * @param data  first dataset
     * @param right second dataset
     * @param <T>   element type of both datasets
     * @return the dataset returned by {@code Dataset.intersect}
     */
    public static <T> Dataset<T> intersect(Dataset<T> data, Dataset<T> right) {
        return data.intersect(right);
    }

    /**
     * Delegates to {@code data.except(right)}.
     *
     * @param data  first dataset
     * @param right second dataset
     * @param <T>   element type of both datasets
     * @return the dataset returned by {@code Dataset.except}
     */
    public static <T> Dataset<T> except(Dataset<T> data, Dataset<T> right) {
        return data.except(right);
    }

    /**
     * Delegates to {@code data.sample(...)}.
     *
     * @param data            the dataset to sample
     * @param withReplacement forwarded to {@code Dataset.sample}
     * @param fraction        forwarded to {@code Dataset.sample}
     * @param seed            forwarded as the third argument; {@code null} uses the overload without a seed
     * @param <T>             element type of the dataset
     * @return the dataset returned by {@code Dataset.sample}
     */
    public static <T> Dataset<T> sample(Dataset<T> data, boolean withReplacement, double fraction, Long seed) {
        return seed == null
                ? data.sample(withReplacement, fraction)
                : data.sample(withReplacement, fraction, seed);
    }

    /**
     * Delegates to {@code data.randomSplit(...)}.
     *
     * @param data    the dataset to split
     * @param weights delimiter-joined weights, each parsed with {@link Double#parseDouble(String)}
     * @param seed    forwarded as the second argument; {@code null} uses the overload without a seed
     * @param <T>     element type of the dataset
     * @return the array returned by {@code Dataset.randomSplit}
     * @throws NumberFormatException if a weight cannot be parsed
     */
    public static <T> Dataset<T>[] randomSplit(Dataset<T> data, String weights, Long seed) {
        double[] weightsArray = Arrays.stream(weights.split(Consts.DELIMITER))
                .mapToDouble(Double::parseDouble)
                .toArray();
        return seed == null ? data.randomSplit(weightsArray) : data.randomSplit(weightsArray, seed);
    }

    /**
     * Delegates to {@code data.withColumnRenamed(existingName, newName)}.
     *
     * @param data         the dataset to transform
     * @param existingName current column name
     * @param newName      new column name
     * @return the dataset returned by {@code Dataset.withColumnRenamed}
     */
    public static Dataset<Row> withColumnRenamed(Dataset<?> data, String existingName, String newName) {
        return data.withColumnRenamed(existingName, newName);
    }

    /**
     * Delegates to {@code data.drop(...)}.
     *
     * @param data the dataset to transform
     * @param cols delimiter-joined names of the columns to drop
     * @return the dataset returned by {@code Dataset.drop}
     */
    public static Dataset<Row> drop(Dataset<?> data, String cols) {
        return data.drop(splitColumns(cols));
    }

    /**
     * Delegates to {@code data.dropDuplicates()} or {@code data.dropDuplicates(columns)}.
     *
     * @param data the dataset to de-duplicate
     * @param cols delimiter-joined column names; {@code null} or empty uses the no-argument form
     * @param <T>  element type of the dataset
     * @return the dataset returned by {@code Dataset.dropDuplicates}
     */
    public static <T> Dataset<T> dropDuplicates(Dataset<T> data, String cols) {
        if (Strings.isNullOrEmpty(cols)) {
            return data.dropDuplicates();
        }
        return data.dropDuplicates(splitColumns(cols));
    }

    /**
     * Delegates to {@code data.repartition(numPartitions)}.
     *
     * @param data          the dataset to repartition
     * @param numPartitions forwarded to {@code Dataset.repartition}
     * @param <T>           element type of the dataset
     * @return the dataset returned by {@code Dataset.repartition}
     */
    public static <T> Dataset<T> repartition(Dataset<T> data, int numPartitions) {
        return data.repartition(numPartitions);
    }

    /**
     * Delegates to {@code data.coalesce(numPartitions)}.
     *
     * @param data          the dataset to coalesce
     * @param numPartitions forwarded to {@code Dataset.coalesce}
     * @param <T>           element type of the dataset
     * @return the dataset returned by {@code Dataset.coalesce}
     */
    public static <T> Dataset<T> coalesce(Dataset<T> data, int numPartitions) {
        return data.coalesce(numPartitions);
    }

    /**
     * Delegates to {@code data.distinct()}.
     *
     * @param data the dataset to de-duplicate
     * @param <T>  element type of the dataset
     * @return the dataset returned by {@code Dataset.distinct()}
     */
    public static <T> Dataset<T> distinct(Dataset<T> data) {
        return data.distinct();
    }
}

/**
 * Static wrappers around the streaming-related {@link Dataset} functions.
 */
class DatasetStreaming {

    /**
     * Delegates to {@code data.isStreaming()}.
     *
     * @param data the dataset to inspect
     * @return the value returned by {@code Dataset.isStreaming()}
     */
    public static boolean isStreaming(Dataset<?> data) {
        return data.isStreaming();
    }

    /**
     * Delegates to {@code data.withWatermark(eventTime, delayThreshold)}.
     *
     * @param data           the dataset to transform
     * @param eventTime      forwarded as the first argument of {@code Dataset.withWatermark}
     * @param delayThreshold forwarded as the second argument of {@code Dataset.withWatermark}
     * @param <T>            element type of the dataset
     * @return the dataset returned by {@code Dataset.withWatermark}
     */
    public static <T> Dataset<T> withWatermark(Dataset<T> data, String eventTime, String delayThreshold) {
        return data.withWatermark(eventTime, delayThreshold);
    }
}




© 2015 - 2024 Weber Informatics LLC | Privacy Policy