package com.datastax.data.prepare.spark.dataset;
import com.datastax.data.prepare.spark.dataset.params.Aggregate;
import com.datastax.insight.annonation.InsightComponent;
import com.datastax.insight.annonation.InsightComponentArg;
import com.datastax.insight.core.driver.SparkContextBuilder;
import com.datastax.insight.core.entity.Cache;
import com.datastax.insight.core.entity.DBSource;
import com.datastax.insight.spec.Operator;
import com.datastax.insight.core.service.PersistService;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.FileUtil;
import org.apache.hadoop.fs.Path;
import org.apache.spark.sql.DataFrameWriter;
import org.apache.spark.sql.Dataset;
import org.apache.spark.storage.StorageLevel;
import java.net.URI;
import java.text.SimpleDateFormat;
import java.util.Date;
import java.util.List;
import java.util.Properties;
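
/**
 * Utility operators for persisting Spark {@link Dataset}s: writing to HDFS (merged into a
 * single output file), writing to relational databases via JDBC, incremental date-partitioned
 * saves, checkpointing, and cache management.
 *
 * <p>Illustrative usage (the format, mode, and path below are hypothetical examples):</p>
 * <pre>{@code
 * DataSetWriter.save(dataset, "csv", "overwrite", "hdfs://namenode/data/out.csv");
 * }</pre>
 */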
public class DataSetWriter implements Operator {
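    /**
     * Saves the dataset to {@code path} with the header option enabled.
     * Delegates to {@link #save(Dataset, String, String, String, boolean)}.
     */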
    public static void save(Dataset dataset, String format, String mode, String path) throws Exception {
        save(dataset, format, mode, path, true);
    }
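
    /**
     * Writes the dataset to a temporary directory in the given format, merges the part files
     * into a single file at {@code path} (deleting any existing file at that path first), and
     * records the result path for the current flow via {@link PersistService}.
     */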
    public static void save(Dataset dataset, String format, String mode, String path, boolean withHeader) throws Exception {
        Configuration conf = new Configuration();
        FileSystem hdfs = FileSystem.get(new URI(path), conf);
        String temp = new Path(hdfs.getWorkingDirectory(), "temp_" + String.valueOf(System.currentTimeMillis())).toString();
        DataFrameWriter writer = dataset.write();
        if (withHeader) {
            writer.option("header", true);
        }
        writer.format(format).mode(mode).save(temp);
        if (hdfs.exists(new Path(path))) {
            hdfs.delete(new Path(path), true);
        }
        FileUtil.copyMerge(hdfs, new Path(temp), hdfs, new Path(path), true, conf, null);
        PersistService.invoke("com.datastax.insight.agent.dao.InsightDAO",
                "saveFlowResult",
                new String[]{Integer.class.getTypeName(), String.class.getTypeName()},
                new Object[]{PersistService.getFlowId(), path});
    }
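
    /**
     * Writes the dataset to {@code table} over JDBC using the connection settings
     * (driver, user, password, URL) from the given {@link DBSource}. Does nothing if
     * {@code dbSource} is null.
     */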
    public static void jdbc(Dataset dataset, String mode, DBSource dbSource, String table) {
        if (dbSource != null) {
            Properties properties = new Properties();
            properties.put("driver", dbSource.getDriver());
            properties.put("user", dbSource.getUser());
            properties.put("password", dbSource.getPassword());
            // System.out.println(dbSource.getUrl() + "==>" + dbSource.getUser());
            dataset.write().mode(mode).jdbc(dbSource.getUrl(), table, properties);
        }
    }
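
    /**
     * Writes the dataset to {@code table} using the cached {@link DBSource} whose id matches
     * {@code dbID}; if no matching source is found, nothing is written.
     */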
    public static void jdbc(Dataset dataset, String mode, String dbID, String table) {
        DBSource dbSource = getDBSource(dbID);
        jdbc(dataset, mode, dbSource, table);
    }
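
    /**
     * Writes the dataset to {@code table} over JDBC with an explicit connection. In
     * {@code overwrite} mode the target table is truncated rather than dropped; writes are
     * batched 1000 rows at a time.
     */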
    public static void jdbc(Dataset dataset, String mode, String url, String driver, String user, String password, String table) {
        Properties properties = new Properties();
        properties.put("driver", driver);
        properties.put("user", user);
        properties.put("password", password);
        if ("overwrite".equals(mode)) {
            dataset.write().mode(mode)
                    .option("truncate", true)
                    .option("batchsize", 1000)
                    .jdbc(url, table, properties);
        } else {
            dataset.write().mode(mode)
                    .option("batchsize", 1000)
                    .jdbc(url, table, properties);
        }
    }
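
    /**
     * Looks up a {@link DBSource} by id in the "dbsources" cache; returns null if the cache is
     * empty or no entry matches.
     */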
    private static DBSource getDBSource(String id) {
        @SuppressWarnings("unchecked")
        List<DBSource> dbSourceList = (List<DBSource>) Cache.getCache("dbsources");
        if (dbSourceList != null) {
            return dbSourceList.stream()
                    .filter(d -> d.getId() == Integer.parseInt(id))
                    .findFirst().orElse(null);
        }
        return null;
    }
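
    /**
     * Incremental save: writes the dataset under a date-partitioned directory,
     * {@code <path>/<yyyyMMdd>/<fileName>}, in the given format and mode.
     */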
    public static void incrementDataSave(Dataset dataset, String format, String mode, String path, String fileName, boolean withHeader) {
        SimpleDateFormat dateFormat = new SimpleDateFormat("yyyyMMdd");
        String nowDay = dateFormat.format(new Date());
        String rootPath = path.endsWith("/") ? path : path + "/";
        String savePath = rootPath + nowDay + "/" + fileName;
        DataFrameWriter writer = dataset.write();
        if (withHeader) {
            writer.option("header", true);
        }
        writer.format(format).mode(mode).save(savePath);
    }
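
    /**
     * Persists the dataset (MEMORY_AND_DISK), sets the Spark checkpoint directory to
     * {@code address}, and checkpoints the dataset, returning the checkpointed result.
     */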
@InsightComponent( name = "checkPoint", description = "checkPoint" )
public static Dataset checkPoint(
@InsightComponentArg(externalInput = true, name = "数据集", description = "数据集",defaultValue = "${output}") Dataset data,
@InsightComponentArg(name = "checkPoint地址", description = "checkPoint地址") String address) {
data.persist(StorageLevel.MEMORY_AND_DISK());
SparkContextBuilder.getContext().setCheckpointDir(address);
data.checkpoint();
return data;
}
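
    /**
     * Deletes the given HDFS path (recursively) if it exists. Always returns null; errors are
     * printed to stderr and swallowed.
     */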
@InsightComponent( name = "删除hdfs文件", description = "删除hdfs文件" )
public static Dataset deleteHdfsFile(
@InsightComponentArg(name = "hdfs文件地址", description = "hdfs文件地址",request = true) String hdfsFile) {
try {
FileSystem hdfs = FileSystem.get(new URI(hdfsFile),new Configuration());
Path path = new Path(hdfsFile);
if (hdfs.exists(path)) {
hdfs.delete(path,true);
}
} catch (Exception e) {
e.printStackTrace();
}
return null;
}
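
    /**
     * Persists the dataset with the requested storage level; unrecognized values fall back to
     * MEMORY_AND_DISK.
     */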
@InsightComponent( name = "Dataset Persist", description = "Dataset Persist" )
public static Dataset persist(
@InsightComponentArg(externalInput = true, name = "数据集", description = "数据集",defaultValue = "${output}") Dataset data,
@InsightComponentArg(name = "缓存方式", description = "缓存方式", items = "DISK_ONLY;DISK_ONLY_2;MEMORY_ONLY;MEMORY_ONLY_2;MEMORY_ONLY_SER;MEMORY_ONLY_SER_2;MEMORY_AND_DISK;MEMORY_AND_DISK_2;MEMORY_AND_DISK_SER;MEMORY_AND_DISK_SER_2") String persistMode) {
StorageLevel mode = null;
switch (persistMode) {
case "DISK_ONLY": mode = StorageLevel.DISK_ONLY();break;
case "DISK_ONLY_2": mode = StorageLevel.DISK_ONLY_2();break;
case "MEMORY_ONLY": mode = StorageLevel.MEMORY_ONLY();break;
case "MEMORY_ONLY_2": mode = StorageLevel.MEMORY_ONLY_2();break;
case "MEMORY_ONLY_SER": mode = StorageLevel.MEMORY_ONLY_SER();break;
case "MEMORY_ONLY_SER_2": mode = StorageLevel.MEMORY_AND_DISK_SER_2();break;
case "DISK_ONLYDISK_ONLY": mode = StorageLevel.MEMORY_AND_DISK();break;
case "MEMORY_AND_DISK_2": mode = StorageLevel.MEMORY_AND_DISK_2();break;
default:mode = StorageLevel.MEMORY_AND_DISK();
}
data.persist(mode);
return data;
}
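
    /**
     * Marks the dataset as no longer cached, releasing its blocks from memory and disk, and
     * returns it.
     */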
@InsightComponent( name = "Dataset Unpersist", description = "Dataset Unpersist" )
public static Dataset unpersist(
@InsightComponentArg(externalInput = true, name = "数据集", description = "数据集",defaultValue = "${output}") Dataset data) {
data.unpersist();
return data;
}
}