package com.datastax.insight.ml.spark.data.dataset;
import com.datastax.insight.core.driver.SparkContextBuilder;
import com.datastax.insight.core.entity.Cache;
import com.datastax.insight.core.entity.DBSource;
import com.datastax.insight.spec.Operator;
import com.datastax.insight.core.service.PersistService;
import org.apache.spark.sql.DataFrameReader;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.SparkSession;
import javax.validation.constraints.NotNull;
import java.nio.file.Paths;
import java.util.List;
import java.util.Map;
import java.util.Properties;
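/**
 * Loads data into Spark {@code Dataset<Row>} instances, either from files
 * (csv, text, parquet, json, orc, or any other DataFrameReader format) or
 * from JDBC sources registered in the {@link Cache} under "dbsources".
 *
 * A minimal usage sketch; the path and dbsource id below are hypothetical:
 * <pre>{@code
 * Dataset<Row> df = DataSetLoader.file("csv", "/data/example.csv");
 * Dataset<Row> users = DataSetLoader.jdbc("1", "users");
 * }</pre>
 */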
public class DataSetLoader implements Operator {

    private static final String LINE_SEPARATOR = System.getProperty("line.separator");
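    /**
     * Dispatches on the file type and delegates to the matching loader; any
     * type without a dedicated helper falls through to {@link #load(String, String)}.
     */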
    public static Dataset<Row> file(String type, String path) {
        switch (type) {
            case "csv":
                return csv(path);
            case "text":
                return text(path);
            case "parquet":
                return parquet(path);
            case "json":
                return json(path);
            case "orc":
                return orc(path);
            default:
                return load(type, path);
        }
    }
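    /**
     * Loads a dataset in the given format. When {@code relativePath} is true,
     * the path is resolved against the per-user store settings (see
     * {@link #getDataPath(String)}). {@code header} and {@code sep} apply only
     * to the csv format; a null or empty separator defaults to ",".
     */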
    public static Dataset<Row> load(@NotNull String format, Boolean relativePath, @NotNull String path, Boolean header, String sep) {
        SparkSession session = SparkContextBuilder.getSession();
        DataFrameReader reader = session.read();
        // Guard against a null Boolean; the original unboxed it directly and could NPE.
        String datasetPath = Boolean.TRUE.equals(relativePath) ? getDataPath(path) : path;
        if (format.equals("csv")) {
            if (sep == null || sep.isEmpty()) {
                sep = ",";
            }
            // The original called org.apache.spark.dataset.DataCsvtLoader, which is not a
            // Spark class; the standard DataFrameReader CSV options are assumed equivalent.
            return reader.option("header", Boolean.TRUE.equals(header))
                    .option("sep", sep)
                    .csv(datasetPath);
        }
        return reader.format(format).load(datasetPath);
    }
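    /**
     * Resolves a dataset path against the user's "store.path" and
     * "store.dataset" settings, unless the path already starts with the
     * resolved store root.
     */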
    public static String getDataPath(@NotNull String path) {
        Long userId = Long.parseLong(Cache.getCache("userId").toString());
        Map<String, String> settings = getSettings(userId);
        String storeHome = settings.get("store.path");
        String datasetHome = settings.get("store.dataset");
        return path.startsWith(getStorePath(storeHome, datasetHome))
                ? path : getStorePath(getStorePath(storeHome, datasetHome), path);
    }
    public static Dataset<Row> load(@NotNull String format, @NotNull String path) {
        return load(format, true, path, null, null);
    }
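    // The single-string overloads below accept several paths joined by the
    // platform line separator and fan out to Spark's varargs readers.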
    public static Dataset<Row> text(String path) {
        String[] paths = path.split(LINE_SEPARATOR);
        return text(paths);
    }

    public static Dataset<Row> text(String[] paths) {
        SparkSession session = SparkContextBuilder.getSession();
        return session.read().text(paths);
    }

    public static Dataset<Row> parquet(String path) {
        String[] paths = path.split(LINE_SEPARATOR);
        return parquet(paths);
    }

    public static Dataset<Row> parquet(String[] paths) {
        SparkSession session = SparkContextBuilder.getSession();
        return session.read().parquet(paths);
    }

    public static Dataset<Row> csv(String path) {
        String[] paths = path.split(LINE_SEPARATOR);
        return csv(paths);
    }

    public static Dataset<Row> csv(String[] paths) {
        SparkSession session = SparkContextBuilder.getSession();
        return session.read().csv(paths);
    }

    public static Dataset<Row> json(String path) {
        String[] paths = path.split(LINE_SEPARATOR);
        return json(paths);
    }

    public static Dataset<Row> json(String[] paths) {
        SparkSession session = SparkContextBuilder.getSession();
        return session.read().json(paths);
    }

    public static Dataset<Row> orc(String path) {
        String[] paths = path.split(LINE_SEPARATOR);
        return orc(paths);
    }

    public static Dataset<Row> orc(String[] paths) {
        SparkSession session = SparkContextBuilder.getSession();
        return session.read().orc(paths);
    }
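    /**
     * Reads a JDBC table using the connection details of the cached
     * {@link DBSource} whose id matches {@code dbsource}.
     */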
    public static Dataset<Row> jdbc(String dbsource, String table) {
        SparkSession session = SparkContextBuilder.getSession();
        DBSource source = getDBSource(dbsource);
        // Fail fast: the original null-checked source for the properties but
        // still dereferenced it for the URL, which could NPE.
        if (source == null) {
            throw new IllegalArgumentException("Unknown dbsource: " + dbsource);
        }
        Properties properties = new Properties();
        properties.put("driver", source.getDriver());
        properties.put("user", source.getUser());
        properties.put("password", source.getPassword());
        return session.read().jdbc(source.getUrl(), table, properties);
    }
    public static Dataset<Row> jdbc(String dbsource, String table, String columnName,
                                    long lowerBound, long upperBound, int numPartitions) {
        SparkSession session = SparkContextBuilder.getSession();
        DBSource source = getDBSource(dbsource);
        if (source == null) {
            throw new IllegalArgumentException("Unknown dbsource: " + dbsource);
        }
        Properties properties = new Properties();
        properties.put("driver", source.getDriver());
        properties.put("user", source.getUser());
        properties.put("password", source.getPassword());
        // Pass the partitioning arguments through to Spark; the original ignored them.
        return session.read().jdbc(source.getUrl(), table, columnName,
                lowerBound, upperBound, numPartitions, properties);
    }
    private static DBSource getDBSource(String id) {
        @SuppressWarnings("unchecked")
        List<DBSource> dbSourceList = (List<DBSource>) Cache.getCache("dbsources");
        if (dbSourceList != null) {
            return dbSourceList.stream()
                    .filter(d -> d.getId() == Long.parseLong(id))
                    .findFirst().orElse(null);
        }
        return null;
    }
    @SuppressWarnings("unchecked")
    private static Map<String, String> getSettings(Long userId) {
        Object result = PersistService.invoke("com.datastax.insight.agent.dao.InsightDAO",
                "getSystemSettings",
                new String[]{ Long.class.getTypeName() },
                new Object[]{ userId });
        return (Map<String, String>) result;
    }
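    // Joins two store segments. HDFS URIs are concatenated with exactly one "/"
    // because java.nio.file.Paths would mangle the "hdfs://" scheme; local paths
    // go through Paths.get. For example (hypothetical values):
    //   getStorePath("hdfs://nn:8020/store", "datasets") -> "hdfs://nn:8020/store/datasets"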
    private static String getStorePath(String storeHome, String storePath) {
        if (storeHome.toLowerCase().startsWith("hdfs://")) {
            if (storeHome.endsWith("/") && storePath.startsWith("/")) {
                return storeHome + storePath.substring(1);
            } else if (!storeHome.endsWith("/") && !storePath.startsWith("/")) {
                return storeHome + "/" + storePath;
            } else {
                return storeHome + storePath;
            }
        } else {
            return Paths.get(storeHome, storePath).toString();
        }
    }
}