package com.datastax.data.exploration.common;

import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.SparkSession;
import org.javatuples.Pair;
import org.javatuples.Triplet;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.util.List;

/**
 * Spark operations used during data exploration.
 */
public class SparkOperator {

    private static final Logger logger = LoggerFactory.getLogger(SparkOperator.class);

    /**
     * Reads a dataset and returns its contents when it is small enough to display.
     *
     * @param title   whether the dataset contains a header row
     * @param address location of the dataset
     * @param limit   maximum number of records that may be returned for display
     * @return record count, column headers, and rows; the headers and rows are
     *         null when the count reaches the limit
     */
    public static Triplet<Long, String[], List<Row>> getData(boolean title, String address, long limit) {
        SparkSession sparkSession = SparkSession.builder()
                .appName("datastax-insight")
                .master("local[*]")
                .getOrCreate();
        try {
            Dataset<Row> dataset = getDataset(sparkSession, title, address);
            long count = dataset.count();
            if (count < limit) {
                String[] columns = dataset.columns();
                List<Row> listRow = dataset.collectAsList();
                return new Triplet<>(count, columns, listRow);
            }
            // too many records to display: return the count alone
            return new Triplet<>(count, null, null);
        } catch (Exception e) {
            logger.error("failed to read dataset: " + address, e);
            return new Triplet<>(0L, null, null);
        } finally {
            sparkSession.stop();
            logger.info("sparkSession {} stopped", sparkSession.hashCode());
        }
    }
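
    // A minimal usage sketch, not part of the original class: shows how a caller
    // might unpack the Triplet returned by getData. The method name "preview" and
    // the 1000-row limit are assumptions for illustration.
    private static void preview(String csvPath) {
        Triplet<Long, String[], List<Row>> result = getData(true, csvPath, 1000L);
        logger.info("total records: {}", result.getValue0());
        if (result.getValue1() != null) {
            logger.info("columns: {}", String.join(", ", result.getValue1()));
            result.getValue2().forEach(row -> logger.info(row.mkString(", ")));
        }
    }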

    /**
     * Takes a random sample of the dataset.
     *
     * @param title   whether the dataset contains a header row
     * @param address location of the dataset
     * @param size    number of records to sample
     * @return column headers and sampled rows
     */
    public static Pair<String[], List<Row>> sample(boolean title, String address, int size) {
        SparkSession sparkSession = SparkSession.builder()
                .appName("datastax-insight")
                .master("local[*]")
                .getOrCreate();
        try {
            Dataset<Row> dataset = getDataset(sparkSession, title, address);
            String[] columns = dataset.columns();
            // sample without replacement; fewer rows come back if the dataset holds fewer than size records
            List<Row> listRow = dataset.javaRDD().takeSample(false, size);
            return new Pair<>(columns, listRow);
        } catch (Exception e) {
            logger.error("failed to sample dataset: " + address, e);
            return new Pair<>(null, null);
        } finally {
            sparkSession.stop();
            logger.info("sparkSession {} stopped", sparkSession.hashCode());
        }
    }
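
    // A minimal usage sketch, an assumption rather than part of the original
    // class: previews a random sample of the file. Mirrors the preview helper
    // above but goes through sample() instead of a full collect.
    private static void previewSample(String csvPath, int size) {
        Pair<String[], List<Row>> result = sample(true, csvPath, size);
        if (result.getValue0() != null) {
            logger.info("columns: {}", String.join(", ", result.getValue0()));
            result.getValue1().forEach(row -> logger.info(row.mkString(", ")));
        }
    }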

    private static Dataset<Row> getDataset(SparkSession sparkSession, boolean title, String address) {
        Dataset<Row> dataset;
        if (title) {
            // the first line of the file supplies the column names
            dataset = sparkSession.read().option("header", "true").csv(address);
        } else {
            // columns fall back to Spark's default names: _c0, _c1, ...
            dataset = sparkSession.read().csv(address);
        }
        if (dataset == null) logger.error("read dataset error!");
        return dataset;
    }

    /**
     * Reads the full dataset.
     *
     * @param title   whether the dataset contains a header row
     * @param address location of the dataset
     * @return column headers and all rows
     */
    public static Pair<String[], List<Row>> getData(boolean title, String address) {
        SparkSession sparkSession = SparkSession.builder()
                .appName("datastax-insight")
                .master("local[*]")
                .getOrCreate();
        try {
            Dataset<Row> dataset = getDataset(sparkSession, title, address);
            String[] columns = dataset.columns();
            List<Row> listRow = dataset.collectAsList();
            return new Pair<>(columns, listRow);
        } catch (Exception e) {
            logger.error("failed to read dataset: " + address, e);
            return new Pair<>(null, null);
        } finally {
            sparkSession.stop();
            logger.info("sparkSession {} stopped", sparkSession.hashCode());
        }
    }
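
    // Hypothetical smoke test; the CSV path is an assumption for illustration.
    // Exercises the full-read overload. Note that every public method here builds
    // and stops its own local SparkSession, so each call pays the full session
    // start-up cost; that keeps calls self-contained at the price of speed.
    public static void main(String[] args) {
        String path = args.length > 0 ? args[0] : "data/sample.csv"; // assumed path
        Pair<String[], List<Row>> all = getData(true, path);
        if (all.getValue0() == null) {
            logger.warn("nothing read from {}", path);
            return;
        }
        logger.info("{} columns, {} rows", all.getValue0().length, all.getValue1().size());
    }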
}