
package com.datastax.insight.ml.spark.data.dataset;

import com.datastax.insight.core.driver.SparkContextBuilder;
import com.datastax.insight.core.entity.Cache;
import com.datastax.insight.core.entity.DBSource;
import com.datastax.insight.spec.Operator;
import com.datastax.insight.core.service.PersistService;
import org.apache.spark.dataset.DataCsvtLoader;
import org.apache.spark.sql.DataFrameReader;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.SparkSession;

import javax.validation.constraints.NotNull;
import java.nio.file.Paths;
import java.util.List;
import java.util.Map;
import java.util.Properties;

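/**
 * Loads Spark {@link Dataset}s from files (CSV, text, Parquet, JSON, ORC),
 * arbitrary DataFrameReader formats, and JDBC sources. Relative paths are
 * resolved against the per-user dataset store from the system settings.
 */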
public class DataSetLoader implements Operator {
    private static final String LINE_SEPARATOR = System.getProperty("line.separator");

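    /** Dispatches to the loader matching the given file type. */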
    public static Dataset<Row> file(String type, String path) {
        switch (type) {
            case "csv":
                return csv(path);
            case "text":
                return text(path);
            case "parquet":
                return parquet(path);
            case "json":
                return json(path);
            case "orc":
                return orc(path);
            default:
                return load(type, path);
        }
    }

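    /**
     * Loads a dataset in the given format. When relativePath is true, the path
     * is resolved against the user's configured dataset store; for CSV the
     * separator defaults to a comma.
     */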
    public static Dataset<Row> load(@NotNull String format, Boolean relativePath, @NotNull String path, Boolean header, String sep) {
        SparkSession session = SparkContextBuilder.getSession();
        DataFrameReader reader = session.read();

        String datasetPath = path;
        if (Boolean.TRUE.equals(relativePath)) {
            Long userId = Long.parseLong(Cache.getCache("userId").toString());
            Map<String, String> settings = getSettings(userId);
            String storeHome = settings.get("store.path");
            String datasetHome = settings.get("store.dataset");
            datasetPath = path.startsWith(getStorePath(storeHome, datasetHome))
                    ? path : getStorePath(getStorePath(storeHome, datasetHome), path);
        }

        if (format.equals("csv")) {
            if (sep == null || sep.isEmpty()) {
                sep = ",";
            }
            // Guard against a null header flag (e.g. from the two-argument overload).
            return DataCsvtLoader.loader(datasetPath, Boolean.TRUE.equals(header), sep);
        }

        return reader.format(format).load(datasetPath);
    }

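    /** Resolves a path against the current user's configured dataset store. */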
    public static String getDataPath(@NotNull String path) {
        Long userId = Long.parseLong(Cache.getCache("userId").toString());
        Map<String, String> settings = getSettings(userId);
        String storeHome = settings.get("store.path");
        String datasetHome = settings.get("store.dataset");
        return path.startsWith(getStorePath(storeHome, datasetHome))
                ? path : getStorePath(getStorePath(storeHome, datasetHome), path);
    }

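    /** Convenience overload: loads from a store-relative path with default options. */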
    public static Dataset<Row> load(@NotNull String format, @NotNull String path) {
        return load(format, true, path, null, null);
    }

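    // Each single-path overload below also accepts multiple paths joined by
    // the platform line separator, fanning out to the array overload.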
    public static Dataset<Row> text(String path) {
        String[] paths = path.split(LINE_SEPARATOR);
        return text(paths);
    }

    public static Dataset<Row> text(String[] paths) {
        SparkSession session = SparkContextBuilder.getSession();
        return session.read().text(paths);
    }

    public static Dataset<Row> parquet(String path) {
        String[] paths = path.split(LINE_SEPARATOR);
        return parquet(paths);
    }

    public static Dataset<Row> parquet(String[] paths) {
        SparkSession session = SparkContextBuilder.getSession();
        return session.read().parquet(paths);
    }

    public static Dataset<Row> csv(String path) {
        String[] paths = path.split(LINE_SEPARATOR);
        return csv(paths);
    }

    public static Dataset<Row> csv(String[] paths) {
        SparkSession session = SparkContextBuilder.getSession();
        return session.read().csv(paths);
    }

    public static Dataset<Row> json(String path) {
        String[] paths = path.split(LINE_SEPARATOR);
        return json(paths);
    }

    public static Dataset<Row> json(String[] paths) {
        SparkSession session = SparkContextBuilder.getSession();
        return session.read().json(paths);
    }

    public static Dataset<Row> orc(String path) {
        String[] paths = path.split(LINE_SEPARATOR);
        return orc(paths);
    }

    public static Dataset<Row> orc(String[] paths) {
        SparkSession session = SparkContextBuilder.getSession();
        return session.read().orc(paths);
    }

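    /** Reads a JDBC table using the connection details of the cached DBSource. */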
    public static Dataset<Row> jdbc(String dbsource, String table) {
        SparkSession session = SparkContextBuilder.getSession();
        DBSource source = getDBSource(dbsource);
        if (source == null) {
            // Fail fast rather than hitting a NullPointerException on source.getUrl() below.
            throw new IllegalArgumentException("Unknown DBSource id: " + dbsource);
        }

        Properties properties = new Properties();
        properties.put("driver", source.getDriver());
        properties.put("user", source.getUser());
        properties.put("password", source.getPassword());

        return session.read().jdbc(source.getUrl(), table, properties);
    }

    public static Dataset<Row> jdbc(String dbsource, String table, String columnName,
                                    long lowerBound, long upperBound, int numPartitions) {
        SparkSession session = SparkContextBuilder.getSession();
        DBSource source = getDBSource(dbsource);
        if (source == null) {
            throw new IllegalArgumentException("Unknown DBSource id: " + dbsource);
        }

        Properties properties = new Properties();
        properties.put("driver", source.getDriver());
        properties.put("user", source.getUser());
        properties.put("password", source.getPassword());

        // Use the partitioned JDBC read so the column, bounds, and partition count take effect.
        return session.read().jdbc(source.getUrl(), table, columnName,
                lowerBound, upperBound, numPartitions, properties);
    }

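    /** Looks up a DBSource by id in the "dbsources" cache entry. */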
    private static DBSource getDBSource(String id) {
        @SuppressWarnings("unchecked")
        List<DBSource> dbSourceList = (List<DBSource>) Cache.getCache("dbsources");
        if (dbSourceList != null) {
            return dbSourceList.stream()
                    .filter(d -> d.getId() == Long.parseLong(id))
                    .findFirst().orElse(null);
        }
        return null;
    }

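    /** Fetches the per-user system settings via the persistence service. */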
    @SuppressWarnings("unchecked")
    private static Map<String, String> getSettings(Long userId) {
        Object result = PersistService.invoke("com.datastax.insight.agent.dao.InsightDAO",
                "getSystemSettings",
                new String[]{ Long.class.getTypeName() },
                new Object[]{ userId });

        return (Map<String, String>) result;
    }

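    /**
     * Joins the store home and a relative path. HDFS URIs are concatenated
     * manually; local paths go through {@link Paths}. Illustrative
     * (hypothetical) values:
     *   getStorePath("hdfs://nn/store", "datasets") -> "hdfs://nn/store/datasets"
     *   getStorePath("/opt/store", "datasets")      -> "/opt/store/datasets"
     */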
    private static String getStorePath(String storeHome, String storePath) {
        if (storeHome.toLowerCase().startsWith("hdfs://")) {
            if(storeHome.endsWith("/") && storePath.startsWith("/")) {
                return storeHome + storePath.substring(1);
            } else if (!storeHome.endsWith("/") && !storePath.startsWith("/")) {
                return storeHome + "/" + storePath;
            } else {
                return storeHome + storePath;
            }
        } else {
            return Paths.get(storeHome, storePath).toString();
        }
    }
}
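
A minimal usage sketch (hypothetical paths and DBSource id; assumes the host application has initialized SparkContextBuilder and populated the "userId" and "dbsources" cache entries):

    Dataset<Row> users  = DataSetLoader.file("csv", "datasets/users.csv");
    Dataset<Row> events = DataSetLoader.load("parquet", "datasets/events");
    Dataset<Row> orders = DataSetLoader.jdbc("1", "orders");
    users.show(10);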