com.datastax.insight.ml.spark.data.dataset.DataSetAction Maven / Gradle / Ivy
The newest version!
package com.datastax.insight.ml.spark.data.dataset;
import com.datastax.insight.spec.Operator;
import com.datastax.insight.core.Consts;
import com.google.common.base.Strings;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.rdd.RDD;
import org.apache.spark.sql.AnalysisException;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.RelationalGroupedDataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.types.StructType;
import org.apache.spark.storage.StorageLevel;
import javax.annotation.Nullable;
import java.util.Arrays;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.stream.Collectors;
/**
 * Operator exposing {@link Dataset} calls that produce a result (rows, counts,
 * console output) rather than another lazily evaluated dataset.
 *
 * <p>Every method is a thin static delegate to the method of the same name on
 * {@link Dataset}. Multi-column arguments are passed as one string that is split
 * on {@code Consts.DELIMITER} (interpreted as a regular expression by
 * {@link String#split(String)}).
 */
public class DataSetAction implements Operator {

    /**
     * Collects all elements of the dataset.
     *
     * @param data dataset to collect
     * @param <T>  element type of the dataset
     * @return the result of {@code data.collect()} cast to {@code T[]}
     */
    @Actional
    @SuppressWarnings("unchecked") // Java sees collect() as returning Object; the cast is erased at runtime
    public static <T> T[] collect(Dataset<T> data) {
        return (T[]) data.collect();
    }

    /**
     * Collects all elements of the dataset as a Java list.
     *
     * @param data dataset to collect
     * @param <T>  element type of the dataset
     * @return the result of {@code data.collectAsList()}
     */
    @Actional
    public static <T> List<T> collectAsList(Dataset<T> data) {
        return data.collectAsList();
    }

    /**
     * Prints the dataset through one of the {@code Dataset.show} overloads.
     *
     * @param data     dataset to print
     * @param numRows  number of rows to show; when {@code null}, {@code data.show()} is
     *                 called and {@code truncate} is ignored
     * @param truncate value forwarded to {@code show(numRows, truncate)}; when {@code null},
     *                 {@code show(numRows)} is called
     */
    @Actional
    public static void show(Dataset<?> data, Integer numRows, Integer truncate) {
        if (numRows == null) {
            data.show();
            return;
        }
        if (truncate == null) {
            data.show(numRows);
            return;
        }
        data.show(numRows, truncate);
    }

    /**
     * Computes the {@code Dataset.describe} statistics.
     *
     * @param data dataset to describe
     * @param cols column names joined by {@code Consts.DELIMITER}; when {@code null} or
     *             empty, {@code describe()} is called with no column arguments instead of
     *             failing on the split
     * @return the dataset returned by {@code describe}
     */
    @Actional
    public static Dataset<Row> describe(Dataset<?> data, String cols) {
        if (Strings.isNullOrEmpty(cols)) {
            return data.describe();
        }
        return data.describe(cols.split(Consts.DELIMITER));
    }

    /**
     * Returns the first {@code n} elements.
     *
     * @param data dataset to read from
     * @param n    number of elements requested
     * @param <T>  element type of the dataset
     * @return the result of {@code data.head(n)} cast to {@code T[]}
     */
    @Actional
    @SuppressWarnings("unchecked") // same erased cast as in collect()
    public static <T> T[] head(Dataset<T> data, int n) {
        return (T[]) data.head(n);
    }

    /**
     * Returns the first element.
     *
     * @param data dataset to read from
     * @param <T>  element type of the dataset
     * @return the result of {@code data.first()}
     */
    @Actional
    public static <T> T first(Dataset<T> data) {
        return data.first();
    }

    /**
     * Takes the first {@code n} elements.
     *
     * @param data dataset to read from
     * @param n    number of elements requested
     * @param <T>  element type of the dataset
     * @return the result of {@code data.take(n)} cast to {@code T[]}
     */
    @Actional
    @SuppressWarnings("unchecked") // same erased cast as in collect()
    public static <T> T[] take(Dataset<T> data, int n) {
        return (T[]) data.take(n);
    }

    /**
     * Counts the elements of the dataset.
     *
     * @param data dataset to count
     * @return the result of {@code data.count()}
     */
    @Actional
    public static long count(Dataset<?> data) {
        return data.count();
    }
}
class DatasetBasic implements Operator {
@Transformal
public static Dataset toDF(Dataset data, @Nullable String cols) {
if(Strings.isNullOrEmpty(cols)) {
return data.toDF();
} else {
String[] columns = cols.split(Consts.DELIMITER);
return data.toDF(columns);
}
}
public StructType schema(Dataset data) {
return data.schema();
}
public static void printSchema(Dataset data) {
data.printSchema();
}
public static void explain(Dataset data, Boolean extended) {
if (extended == null) {
data.explain();
} else {
data.explain(extended);
}
}
public static Map dtypes(Dataset data) {
return Arrays.stream(data.dtypes()).collect(HashMap::new,
(mapping, info)->mapping.put(info._1.toString(), info._2.toString()),
HashMap::putAll);
}
public static String[] columns(Dataset data) {
return data.columns();
}
public static boolean isLocal(Dataset data) {
return data.isLocal();
}
public static Dataset checkpoint(Dataset data, Boolean eager) {
return eager == null ? data.checkpoint() : data.checkpoint(eager.booleanValue());
}
public static Dataset persist(Dataset data, String storageLevel) {
return Strings.isNullOrEmpty(storageLevel) ? data.persist() : data.persist(StorageLevel.fromString(storageLevel));
}
public static Dataset unpersist(Dataset data, Boolean blocking) {
return blocking == null ? data.unpersist() : data.unpersist(blocking);
}
public static RDD rdd(Dataset data) {
return data.rdd();
}
public static JavaRDD javaRDD(Dataset data) {
return data.javaRDD();
}
public static void createTempViewCommand(Dataset data, String viewName, Boolean replace, Boolean global) throws AnalysisException {
if(replace == null || replace.booleanValue() == false) {
if(global == null || global.booleanValue() == false) {
data.createTempView(viewName);
} else {
data.createGlobalTempView(viewName);
}
} else {
if(global == null || global.booleanValue() == false) {
data.createOrReplaceTempView(viewName);
} else {
// data.createOrReplaceGlobalTempView(viewName);
}
}
}
public static Dataset persist(Dataset data) {
return data.toJSON();
}
public static String[] inputFiles(Dataset data) {
return data.inputFiles();
}
}
/**
 * Static delegates for the relational (mostly untyped) operations of {@link Dataset}:
 * null handling, statistics, joins, sorting, projection, grouping, set operations,
 * sampling and partitioning.
 *
 * <p>Arguments that name several columns are passed as a single string that is split on
 * {@code Consts.DELIMITER} (interpreted as a regular expression by
 * {@link String#split(String)}).
 */
class DatasetUntypedRel {

    /**
     * Separator between column and function inside one {@link #agg} entry.
     * NOTE(review): chosen here as ':'; it must not be matched by Consts.DELIMITER - confirm.
     */
    private static final String AGG_PAIR_SEPARATOR = ":";

    /** Splits a delimiter-joined column list into its parts. */
    private static String[] splitColumns(String cols) {
        return cols.split(Consts.DELIMITER);
    }

    /** Returns every element after the first; empty when there is at most one element. */
    private static String[] tail(String[] columns) {
        return Arrays.copyOfRange(columns, 1, columns.length);
    }

    /**
     * Drops rows via {@code data.na().drop()}.
     *
     * @param data dataset to clean
     * @return the dataset returned by {@code na().drop()}
     */
    public static Dataset<Row> dropNA(Dataset<?> data) {
        return data.na().drop();
    }

    /**
     * Fills missing values via one of the {@code data.na().fill} overloads.
     *
     * @param data  dataset to fill
     * @param type  type name selecting the overload: the type name of {@code long}/{@link Long}
     *              selects {@code fill(long)}, that of {@code double}/{@link Double} selects
     *              {@code fill(double)}; anything else (including {@code null}) selects
     *              {@code fill(String)}
     * @param value replacement value, parsed according to {@code type}
     * @return the dataset returned by {@code fill}
     * @throws NumberFormatException if {@code value} cannot be parsed as the selected numeric type
     */
    public static Dataset<Row> fillNA(Dataset<?> data, String type, String value) {
        // Constant-first equals keeps a null type from throwing; it falls through to the string fill.
        if (long.class.getTypeName().equals(type) || Long.class.getTypeName().equals(type)) {
            return data.na().fill(Long.parseLong(value));
        }
        if (double.class.getTypeName().equals(type) || Double.class.getTypeName().equals(type)) {
            return data.na().fill(Double.parseDouble(value));
        }
        return data.na().fill(value);
    }

    /**
     * Computes the correlation of two columns via {@code data.stat().corr}.
     *
     * @param data   dataset to analyse
     * @param col1   first column name
     * @param col2   second column name
     * @param method forwarded to {@code corr(col1, col2, method)}; when {@code null} or empty
     *               the two-argument overload is used
     * @return the value returned by {@code corr}
     */
    public static double corr(Dataset<?> data, String col1, String col2, String method) {
        if (Strings.isNullOrEmpty(method)) {
            return data.stat().corr(col1, col2);
        }
        return data.stat().corr(col1, col2, method);
    }

    /**
     * Computes the covariance of two columns via {@code data.stat().cov}.
     *
     * @param data dataset to analyse
     * @param col1 first column name
     * @param col2 second column name
     * @return the value returned by {@code cov}
     */
    public static double cov(Dataset<?> data, String col1, String col2) {
        return data.stat().cov(col1, col2);
    }

    /**
     * Builds the cross tabulation of two columns via {@code data.stat().crosstab}.
     *
     * @param data dataset to analyse
     * @param col1 first column name
     * @param col2 second column name
     * @return the dataset returned by {@code crosstab}
     */
    public static Dataset<Row> crosstab(Dataset<?> data, String col1, String col2) {
        return data.stat().crosstab(col1, col2);
    }

    /**
     * Finds frequent items via {@code data.stat().freqItems}.
     *
     * @param data    dataset to analyse
     * @param cols    column names joined by {@code Consts.DELIMITER}
     * @param support forwarded to {@code freqItems(columns, support)}; when {@code null} the
     *                single-argument overload is used
     * @return the dataset returned by {@code freqItems}
     */
    public static Dataset<Row> freqItems(Dataset<?> data, String cols, Double support) {
        String[] columns = splitColumns(cols);
        if (support == null) {
            return data.stat().freqItems(columns);
        }
        return data.stat().freqItems(columns, support);
    }

    /**
     * Joins two datasets on the given columns via
     * {@code data.join(right, usingColumns, joinType)}.
     *
     * @param data     left dataset
     * @param right    right dataset
     * @param cols     join column names joined by {@code Consts.DELIMITER}
     * @param joinType join type string forwarded unchanged
     * @return the dataset returned by {@code join}
     */
    public static Dataset<Row> join(Dataset<?> data, Dataset<?> right, String cols, String joinType) {
        List<String> columns = Arrays.asList(splitColumns(cols));
        return data.join(right, scala.collection.JavaConversions.asScalaBuffer(columns).toSeq(), joinType);
    }

    /**
     * Cross-joins two datasets via {@code data.crossJoin(right)}.
     *
     * @param data  left dataset
     * @param right right dataset
     * @return the dataset returned by {@code crossJoin}
     */
    public static Dataset<Row> crossJoin(Dataset<?> data, Dataset<?> right) {
        return data.crossJoin(right);
    }

    /**
     * Sorts each partition by the given columns.
     *
     * <p>Uses the {@code (String, String...)} overload, the same way {@link #sort} and
     * {@link #orderBy} do. The previous implementation passed a single Scala {@code Seq} of
     * column-name strings, which can only bind to the {@code Column}-based overload.
     *
     * @param data dataset to sort
     * @param cols column names joined by {@code Consts.DELIMITER}
     * @param <T>  element type of the dataset
     * @return the dataset returned by {@code sortWithinPartitions}
     */
    public static <T> Dataset<T> sortWithinPartitions(Dataset<T> data, String cols) {
        String[] columns = splitColumns(cols);
        return data.sortWithinPartitions(columns[0], tail(columns));
    }

    /**
     * Sorts the dataset via {@code data.sort}.
     *
     * @param data dataset to sort
     * @param cols column names joined by {@code Consts.DELIMITER}
     * @param <T>  element type of the dataset
     * @return the dataset returned by {@code sort}
     */
    public static <T> Dataset<T> sort(Dataset<T> data, String cols) {
        String[] columns = splitColumns(cols);
        return data.sort(columns[0], tail(columns));
    }

    /**
     * Orders the dataset via {@code data.orderBy}.
     *
     * @param data dataset to order
     * @param cols column names joined by {@code Consts.DELIMITER}
     * @param <T>  element type of the dataset
     * @return the dataset returned by {@code orderBy}
     */
    public static <T> Dataset<T> orderBy(Dataset<T> data, String cols) {
        String[] columns = splitColumns(cols);
        return data.orderBy(columns[0], tail(columns));
    }

    /**
     * Assigns an alias via {@code data.as(alias)}.
     *
     * @param data  dataset to alias
     * @param alias alias name
     * @param <T>   element type of the dataset
     * @return the dataset returned by {@code as}
     */
    public static <T> Dataset<T> as(Dataset<T> data, String alias) {
        return data.as(alias);
    }

    /**
     * Selects columns via {@code data.select}.
     *
     * @param data dataset to project
     * @param cols column names joined by {@code Consts.DELIMITER}
     * @return the dataset returned by {@code select}
     */
    public static Dataset<Row> select(Dataset<?> data, String cols) {
        String[] columns = splitColumns(cols);
        return data.select(columns[0], tail(columns));
    }

    /**
     * Selects expressions via {@code data.selectExpr}.
     *
     * @param data dataset to project
     * @param cols expressions joined by {@code Consts.DELIMITER}
     * @return the dataset returned by {@code selectExpr}
     */
    public static Dataset<Row> selectExpr(Dataset<?> data, String cols) {
        return data.selectExpr(splitColumns(cols));
    }

    /**
     * Filters rows via {@code data.filter(conditionExpr)}.
     *
     * @param data          dataset to filter
     * @param conditionExpr condition expression string
     * @param <T>           element type of the dataset
     * @return the dataset returned by {@code filter}
     */
    public static <T> Dataset<T> filter(Dataset<T> data, String conditionExpr) {
        return data.filter(conditionExpr);
    }

    /**
     * Filters rows via {@code data.where(conditionExpr)}.
     *
     * @param data          dataset to filter
     * @param conditionExpr condition expression string
     * @param <T>           element type of the dataset
     * @return the dataset returned by {@code where}
     */
    public static <T> Dataset<T> where(Dataset<T> data, String conditionExpr) {
        return data.where(conditionExpr);
    }

    /**
     * Groups the dataset via {@code data.groupBy}.
     *
     * @param data dataset to group
     * @param cols column names joined by {@code Consts.DELIMITER}
     * @return the grouped dataset returned by {@code groupBy}
     */
    public static RelationalGroupedDataset groupBy(Dataset<?> data, String cols) {
        String[] columns = splitColumns(cols);
        return data.groupBy(columns[0], tail(columns));
    }

    /**
     * Builds a rollup via {@code data.rollup}.
     *
     * @param data dataset to group
     * @param cols column names joined by {@code Consts.DELIMITER}
     * @return the grouped dataset returned by {@code rollup}
     */
    public static RelationalGroupedDataset rollup(Dataset<?> data, String cols) {
        String[] columns = splitColumns(cols);
        return data.rollup(columns[0], tail(columns));
    }

    /**
     * Builds a cube via {@code data.cube}.
     *
     * @param data dataset to group
     * @param cols column names joined by {@code Consts.DELIMITER}
     * @return the grouped dataset returned by {@code cube}
     */
    public static RelationalGroupedDataset cube(Dataset<?> data, String cols) {
        String[] columns = splitColumns(cols);
        return data.cube(columns[0], tail(columns));
    }

    /**
     * Aggregates the whole dataset via {@code data.agg(Map)}.
     *
     * <p>{@code aggExpr} is a list of {@code column:function} entries joined by
     * {@code Consts.DELIMITER}. Each entry is split at its last ':'; both halves are
     * trimmed. The previous implementation split every entry with the list delimiter again,
     * so the second half never existed and the call failed with
     * {@link ArrayIndexOutOfBoundsException}.
     *
     * @param data    dataset to aggregate
     * @param aggExpr aggregate entries as described above
     * @return the dataset returned by {@code agg}
     * @throws IllegalArgumentException if an entry has no ':' or an empty half
     */
    public static Dataset<Row> agg(Dataset<?> data, String aggExpr) {
        Map<String, String> exprs = new HashMap<>();
        for (String entry : aggExpr.split(Consts.DELIMITER)) {
            int separatorAt = entry.lastIndexOf(AGG_PAIR_SEPARATOR);
            String column = separatorAt < 0 ? "" : entry.substring(0, separatorAt).trim();
            String function = separatorAt < 0 ? "" : entry.substring(separatorAt + 1).trim();
            if (column.isEmpty() || function.isEmpty()) {
                throw new IllegalArgumentException(
                        "Invalid aggregate entry '" + entry + "': expected column"
                                + AGG_PAIR_SEPARATOR + "function");
            }
            exprs.put(column, function);
        }
        return data.agg(exprs);
    }

    /**
     * Limits the dataset via {@code data.limit(n)}.
     *
     * @param data dataset to limit
     * @param n    value forwarded to {@code limit}
     * @param <T>  element type of the dataset
     * @return the dataset returned by {@code limit}
     */
    public static <T> Dataset<T> limit(Dataset<T> data, int n) {
        return data.limit(n);
    }

    /**
     * Unions two datasets via {@code data.union(right)}.
     *
     * @param data  left dataset
     * @param right right dataset
     * @param <T>   element type of both datasets
     * @return the dataset returned by {@code union}
     */
    public static <T> Dataset<T> union(Dataset<T> data, Dataset<T> right) {
        return data.union(right);
    }

    /**
     * Intersects two datasets via {@code data.intersect(right)}.
     *
     * @param data  left dataset
     * @param right right dataset
     * @param <T>   element type of both datasets
     * @return the dataset returned by {@code intersect}
     */
    public static <T> Dataset<T> intersect(Dataset<T> data, Dataset<T> right) {
        return data.intersect(right);
    }

    /**
     * Subtracts one dataset from another via {@code data.except(right)}.
     *
     * @param data  left dataset
     * @param right right dataset
     * @param <T>   element type of both datasets
     * @return the dataset returned by {@code except}
     */
    public static <T> Dataset<T> except(Dataset<T> data, Dataset<T> right) {
        return data.except(right);
    }

    /**
     * Samples the dataset via {@code data.sample}.
     *
     * @param data            dataset to sample
     * @param withReplacement forwarded unchanged
     * @param fraction        forwarded unchanged
     * @param seed            forwarded to the three-argument overload; when {@code null} the
     *                        two-argument overload is used
     * @param <T>             element type of the dataset
     * @return the dataset returned by {@code sample}
     */
    public static <T> Dataset<T> sample(Dataset<T> data, boolean withReplacement, double fraction, Long seed) {
        if (seed == null) {
            return data.sample(withReplacement, fraction);
        }
        return data.sample(withReplacement, fraction, seed);
    }

    /**
     * Splits the dataset via {@code data.randomSplit}.
     *
     * @param data    dataset to split
     * @param weights weights joined by {@code Consts.DELIMITER}, each parsed with
     *                {@link Double#parseDouble(String)}
     * @param seed    forwarded to {@code randomSplit(weights, seed)}; when {@code null} the
     *                single-argument overload is used
     * @param <T>     element type of the dataset
     * @return the array returned by {@code randomSplit}
     * @throws NumberFormatException if a weight cannot be parsed
     */
    public static <T> Dataset<T>[] randomSplit(Dataset<T> data, String weights, Long seed) {
        double[] weightsArray = Arrays.stream(weights.split(Consts.DELIMITER))
                .mapToDouble(Double::parseDouble)
                .toArray();
        if (seed == null) {
            return data.randomSplit(weightsArray);
        }
        return data.randomSplit(weightsArray, seed);
    }

    /**
     * Renames a column via {@code data.withColumnRenamed}.
     *
     * @param data         dataset to change
     * @param existingName current column name
     * @param newName      new column name
     * @return the dataset returned by {@code withColumnRenamed}
     */
    public static Dataset<Row> withColumnRenamed(Dataset<?> data, String existingName, String newName) {
        return data.withColumnRenamed(existingName, newName);
    }

    /**
     * Drops columns via {@code data.drop}.
     *
     * @param data dataset to change
     * @param cols column names joined by {@code Consts.DELIMITER}
     * @return the dataset returned by {@code drop}
     */
    public static Dataset<Row> drop(Dataset<?> data, String cols) {
        return data.drop(splitColumns(cols));
    }

    /**
     * Drops duplicate rows via {@code data.dropDuplicates}.
     *
     * @param data dataset to deduplicate
     * @param cols column names joined by {@code Consts.DELIMITER}; when {@code null} or empty,
     *             {@code dropDuplicates()} is called without columns
     * @param <T>  element type of the dataset
     * @return the dataset returned by {@code dropDuplicates}
     */
    public static <T> Dataset<T> dropDuplicates(Dataset<T> data, String cols) {
        if (Strings.isNullOrEmpty(cols)) {
            return data.dropDuplicates();
        }
        return data.dropDuplicates(splitColumns(cols));
    }

    /**
     * Repartitions the dataset via {@code data.repartition(numPartitions)}.
     *
     * @param data          dataset to repartition
     * @param numPartitions value forwarded to {@code repartition}
     * @param <T>           element type of the dataset
     * @return the dataset returned by {@code repartition}
     */
    public static <T> Dataset<T> repartition(Dataset<T> data, int numPartitions) {
        return data.repartition(numPartitions);
    }

    /**
     * Coalesces the dataset via {@code data.coalesce(numPartitions)}.
     *
     * @param data          dataset to coalesce
     * @param numPartitions value forwarded to {@code coalesce}
     * @param <T>           element type of the dataset
     * @return the dataset returned by {@code coalesce}
     */
    public static <T> Dataset<T> coalesce(Dataset<T> data, int numPartitions) {
        return data.coalesce(numPartitions);
    }

    /**
     * Removes duplicates via {@code data.distinct()}.
     *
     * @param data dataset to deduplicate
     * @param <T>  element type of the dataset
     * @return the dataset returned by {@code distinct}
     */
    public static <T> Dataset<T> distinct(Dataset<T> data) {
        return data.distinct();
    }
}
/**
 * Static delegates for the streaming-related functions of {@link Dataset}.
 */
class DatasetStreaming {

    /**
     * Reports the result of {@code data.isStreaming()}.
     *
     * @param data dataset to inspect
     * @return the result of {@code data.isStreaming()}
     */
    public static boolean isStreaming(Dataset<?> data) {
        return data.isStreaming();
    }

    /**
     * Defines a watermark via {@code data.withWatermark(eventTime, delayThreshold)}.
     *
     * @param data           dataset to annotate
     * @param eventTime      forwarded unchanged as the first argument
     * @param delayThreshold forwarded unchanged as the second argument
     * @param <T>            element type of the dataset, preserved in the result
     * @return the dataset returned by {@code withWatermark}
     */
    public static <T> Dataset<T> withWatermark(Dataset<T> data, String eventTime, String delayThreshold) {
        return data.withWatermark(eventTime, delayThreshold);
    }
}
© 2015 - 2025 Weber Informatics LLC | Privacy Policy