package com.datastax.data.prepare.spark.dataset;

import com.alibaba.fastjson.JSONArray;
import com.datastax.insight.annonation.InsightComponent;
import com.datastax.insight.annonation.InsightComponentArg;
import com.datastax.insight.core.driver.SparkContextBuilder;
import com.datastax.insight.core.entity.Cache;
import com.datastax.insight.core.entity.DBSource;
import com.datastax.insight.spec.Operator;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.SparkSession;
import scala.collection.JavaConversions;

import java.util.List;
import java.util.Properties;

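/**
 * Loads Spark Datasets from files (csv, text, parquet, json, orc, or any other
 * registered DataSource format), from JDBC tables resolved through cached
 * {@link DBSource} definitions, and from in-memory JSON array strings.
 * Multi-path arguments are passed as a single string with one path per line,
 * split on the platform line separator.
 */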
public class DataSetLoader implements Operator {
    private static final String LINE_SEPARATOR = System.getProperty("line.separator");

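    /**
     * Dispatches to the loader matching {@code type}; unrecognized types fall
     * through to the generic {@link #load(String, String)}. Illustrative sketch
     * (the paths are hypothetical, and it assumes SparkContextBuilder already
     * holds a live SparkSession and that the fallback format's connector is on
     * the classpath):
     * <pre>{@code
     * Dataset<Row> users = DataSetLoader.file("csv", "/data/users.csv");
     * Dataset<Row> logs  = DataSetLoader.file("avro", "/data/logs.avro"); // falls back to load("avro", ...)
     * }</pre>
     */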
    public static Dataset<Row> file(String type, String path) {
        switch (type) {
            case "csv":
                return csv(path);
            case "text":
                return text(path);
            case "parquet":
                return parquet(path);
            case "json":
                return json(path);
            case "orc":
                return orc(path);
            default:
                return load(type, path);
        }
    }

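    /**
     * Generic loader for any registered Spark DataSource format. A minimal
     * sketch, assuming the named format is available on the classpath:
     * <pre>{@code
     * Dataset<Row> df = DataSetLoader.load("libsvm", "/data/sample_libsvm_data.txt");
     * }</pre>
     */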
    public static Dataset<Row> load(String format, String path) {
        SparkSession session = SparkContextBuilder.getSession();
        return session.read().format(format).load(path);
    }

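    /**
     * The single-string overloads below accept several paths in one argument,
     * one path per line, and split on the platform line separator. A
     * hypothetical two-file read:
     * <pre>{@code
     * String paths = "/data/2023.txt" + System.lineSeparator() + "/data/2024.txt";
     * Dataset<Row> df = DataSetLoader.text(paths);
     * }</pre>
     */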
    public static Dataset<Row> text(String path) {
        return text(path.split(LINE_SEPARATOR));
    }

    public static Dataset<Row> text(String[] paths) {
        SparkSession session = SparkContextBuilder.getSession();
        return session.read().text(paths);
    }

    public static Dataset<Row> parquet(String path) {
        return parquet(path.split(LINE_SEPARATOR));
    }

    public static Dataset<Row> parquet(String[] paths) {
        SparkSession session = SparkContextBuilder.getSession();
        return session.read().parquet(paths);
    }

    public static Dataset<Row> csv(String path) {
        return csv(path.split(LINE_SEPARATOR));
    }

    public static Dataset<Row> csv(String[] paths) {
        SparkSession session = SparkContextBuilder.getSession();
        return session.read().csv(paths);
    }

    public static Dataset<Row> json(String path) {
        return json(path.split(LINE_SEPARATOR));
    }

    public static Dataset<Row> json(String[] paths) {
        SparkSession session = SparkContextBuilder.getSession();
        return session.read().json(paths);
    }

    public static Dataset<Row> orc(String path) {
        return orc(path.split(LINE_SEPARATOR));
    }

    public static Dataset<Row> orc(String[] paths) {
        SparkSession session = SparkContextBuilder.getSession();
        return session.read().orc(paths);
    }

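    /**
     * Reads a table through a DBSource registered in the "dbsources" cache,
     * which supplies the JDBC url, driver, user, and password. Illustrative
     * sketch (the id "1" and the table name are hypothetical):
     * <pre>{@code
     * Dataset<Row> orders = DataSetLoader.jdbc("1", "orders");
     * }</pre>
     */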
    public static Dataset<Row> jdbc(String dbsource, String table) {
        SparkSession session = SparkContextBuilder.getSession();
        DBSource source = getDBSource(dbsource);
        if (source == null) {
            throw new IllegalArgumentException("Unknown DBSource id: " + dbsource);
        }
        return session.read().jdbc(source.getUrl(), table, connectionProperties(source));
    }

    public static Dataset<Row> jdbc(String dbsource, String table, String columnName,
                                    long lowerBound, long upperBound, int numPartitions) {
        SparkSession session = SparkContextBuilder.getSession();
        DBSource source = getDBSource(dbsource);
        if (source == null) {
            throw new IllegalArgumentException("Unknown DBSource id: " + dbsource);
        }
        // Partitioned read: Spark issues numPartitions parallel queries over ranges
        // of columnName between lowerBound and upperBound, instead of silently
        // ignoring these arguments as before.
        return session.read().jdbc(source.getUrl(), table, columnName,
                lowerBound, upperBound, numPartitions, connectionProperties(source));
    }

    private static Properties connectionProperties(DBSource source) {
        Properties properties = new Properties();
        properties.put("driver", source.getDriver());
        properties.put("user", source.getUser());
        properties.put("password", source.getPassword());
        return properties;
    }

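    /**
     * Resolves a DBSource by numeric id from the "dbsources" cache entry;
     * returns null when the cache is empty or no id matches.
     */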
    private static DBSource getDBSource(String id) {
        @SuppressWarnings("unchecked")
        List<DBSource> dbSourceList = (List<DBSource>) Cache.getCache("dbsources");
        if (dbSourceList != null) {
            int numericId = Integer.parseInt(id);
            return dbSourceList.stream()
                    .filter(d -> d.getId() == numericId)
                    .findFirst().orElse(null);
        }
        return null;
    }

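    /**
     * Builds a Dataset from a JSON array of row strings plus a header string.
     * A sketch only: the exact row format BasicOperation.createData expects is
     * not shown in this class, so the comma-separated rows below are an
     * assumption.
     * <pre>{@code
     * Dataset<Row> df = DataSetLoader.arrayToDataset("[\"1,alice\",\"2,bob\"]", "id,name");
     * }</pre>
     */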
    @InsightComponent(name = "string2dataset", description = "string2dataset")
    public static Dataset<Row> arrayToDataset(
            @InsightComponentArg(name = "dataset", description = "dataset") String data,
            @InsightComponentArg(name = "columns", description = "column headers") String columns) {
        List<String> listRow = JSONArray.parseArray(data, String.class);
        return (Dataset<Row>) BasicOperation.createData(JavaConversions.asScalaBuffer(listRow), columns);
    }
}