package com.datastax.data.prepare.spark.dataset;
import com.datastax.data.prepare.spark.dataset.params.Aggregate;
import com.datastax.insight.annonation.InsightComponent;
import com.datastax.insight.annonation.InsightComponentArg;
import com.datastax.insight.core.driver.SparkContextBuilder;
import com.datastax.insight.core.entity.Cache;
import com.datastax.insight.core.entity.DBSource;
import com.datastax.insight.spec.Operator;
import com.datastax.insight.core.service.PersistService;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.FileUtil;
import org.apache.hadoop.fs.Path;
import org.apache.spark.sql.DataFrameWriter;
import org.apache.spark.sql.Dataset;
import org.apache.spark.storage.StorageLevel;
import java.net.URI;
import java.text.SimpleDateFormat;
import java.util.Date;
import java.util.List;
import java.util.Properties;
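
/**
 * Utility operators for persisting Spark {@link Dataset}s: writing to HDFS (merged into a
 * single output file), writing to relational databases via JDBC, incremental date-partitioned
 * saves, checkpointing, and cache management.
 *
 * <p>Illustrative usage (the format, mode, and path below are hypothetical examples):</p>
 * <pre>{@code
 * DataSetWriter.save(dataset, "csv", "overwrite", "hdfs://namenode/data/out.csv");
 * }</pre>
 */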
public class DataSetWriter implements Operator {
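    /**
     * Saves the dataset to {@code path} with the header option enabled.
     * Delegates to {@link #save(Dataset, String, String, String, boolean)}.
     */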
    public static void save(Dataset dataset, String format, String mode, String path) throws Exception {
        save(dataset, format, mode, path, true);
    }
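
    /**
     * Writes the dataset to a temporary directory in the given format, merges the part files
     * into a single file at {@code path} (deleting any existing file at that path first), and
     * records the result path for the current flow via {@link PersistService}.
     */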
    public static void save(Dataset dataset, String format, String mode, String path, boolean withHeader) throws Exception {
        Configuration conf = new Configuration();
        FileSystem hdfs = FileSystem.get(new URI(path), conf);
        String temp = new Path(hdfs.getWorkingDirectory(), "temp_" + String.valueOf(System.currentTimeMillis())).toString();
        DataFrameWriter writer = dataset.write();
        if (withHeader) {
            writer.option("header", true);
        }
        writer.format(format).mode(mode).save(temp);
        if (hdfs.exists(new Path(path))) {
            hdfs.delete(new Path(path), true);
        }
        FileUtil.copyMerge(hdfs, new Path(temp), hdfs, new Path(path), true, conf, null);
        PersistService.invoke("com.datastax.insight.agent.dao.InsightDAO",
                "saveFlowResult",
                new String[]{Integer.class.getTypeName(), String.class.getTypeName()},
                new Object[]{PersistService.getFlowId(), path});
    }
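
    /**
     * Writes the dataset to {@code table} over JDBC using the connection settings
     * (driver, user, password, URL) from the given {@link DBSource}. Does nothing if
     * {@code dbSource} is null.
     */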
    public static void jdbc(Dataset dataset, String mode, DBSource dbSource, String table) {
        if (dbSource != null) {
            Properties properties = new Properties();
            properties.put("driver", dbSource.getDriver());
            properties.put("user", dbSource.getUser());
            properties.put("password", dbSource.getPassword());
            // System.out.println(dbSource.getUrl() + "==>" + dbSource.getUser());
            dataset.write().mode(mode).jdbc(dbSource.getUrl(), table, properties);
        }
    }
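
    /**
     * Writes the dataset to {@code table} using the cached {@link DBSource} whose id matches
     * {@code dbID}; if no matching source is found, nothing is written.
     */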
    public static void jdbc(Dataset dataset, String mode, String dbID, String table) {
        DBSource dbSource = getDBSource(dbID);
        jdbc(dataset, mode, dbSource, table);
    }
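
    /**
     * Writes the dataset to {@code table} over JDBC with an explicit connection. In
     * {@code overwrite} mode the target table is truncated rather than dropped; writes are
     * batched 1000 rows at a time.
     */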
    public static void jdbc(Dataset dataset, String mode, String url, String driver, String user, String password, String table) {
        Properties properties = new Properties();
        properties.put("driver", driver);
        properties.put("user", user);
        properties.put("password", password);
        if ("overwrite".equals(mode)) {
            dataset.write().mode(mode)
                    .option("truncate", true)
                    .option("batchsize", 1000)
                    .jdbc(url, table, properties);
        } else {
            dataset.write().mode(mode)
                    .option("batchsize", 1000)
                    .jdbc(url, table, properties);
        }
    }
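
    /**
     * Looks up a {@link DBSource} by id in the "dbsources" cache; returns null if the cache is
     * empty or no entry matches.
     */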
    private static DBSource getDBSource(String id) {
        @SuppressWarnings("unchecked")
        List<DBSource> dbSourceList = (List<DBSource>) Cache.getCache("dbsources");
        if (dbSourceList != null) {
            return dbSourceList.stream()
                    .filter(d -> d.getId() == Integer.parseInt(id))
                    .findFirst().orElse(null);
        }
        return null;
    }
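
    /**
     * Incremental save: writes the dataset under a date-partitioned directory,
     * {@code <path>/<yyyyMMdd>/<fileName>}, in the given format and mode.
     */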
    public static void incrementDataSave(Dataset dataset, String format, String mode, String path, String fileName, boolean withHeader) {
        SimpleDateFormat dateFormat = new SimpleDateFormat("yyyyMMdd");
        String nowDay = dateFormat.format(new Date());
        String rootPath = path.endsWith("/") ? path : path + "/";
        String savePath = rootPath + nowDay + "/" + fileName;
        DataFrameWriter writer = dataset.write();
        if (withHeader) {
            writer.option("header", true);
        }
        writer.format(format).mode(mode).save(savePath);
    }
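
    /**
     * Persists the dataset (MEMORY_AND_DISK), sets the Spark checkpoint directory to
     * {@code address}, and checkpoints the dataset, returning the checkpointed result.
     */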
@InsightComponent( name = "checkPoint", description = "checkPoint" )
public static Dataset checkPoint(
@InsightComponentArg(externalInput = true, name = "数据集", description = "数据集",defaultValue = "${output}") Dataset data,
@InsightComponentArg(name = "checkPoint地址", description = "checkPoint地址") String address) {
data.persist(StorageLevel.MEMORY_AND_DISK());
SparkContextBuilder.getContext().setCheckpointDir(address);
data.checkpoint();
return data;
}
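
    /**
     * Deletes the given HDFS path (recursively) if it exists. Always returns null; errors are
     * printed to stderr and swallowed.
     */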
@InsightComponent( name = "删除hdfs文件", description = "删除hdfs文件" )
public static Dataset deleteHdfsFile(
@InsightComponentArg(name = "hdfs文件地址", description = "hdfs文件地址",request = true) String hdfsFile) {
try {
FileSystem hdfs = FileSystem.get(new URI(hdfsFile),new Configuration());
Path path = new Path(hdfsFile);
if (hdfs.exists(path)) {
hdfs.delete(path,true);
}
} catch (Exception e) {
e.printStackTrace();
}
return null;
}
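
    /**
     * Persists the dataset with the requested storage level; unrecognized values fall back to
     * MEMORY_AND_DISK.
     */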
@InsightComponent( name = "Dataset Persist", description = "Dataset Persist" )
public static Dataset persist(
@InsightComponentArg(externalInput = true, name = "数据集", description = "数据集",defaultValue = "${output}") Dataset data,
@InsightComponentArg(name = "缓存方式", description = "缓存方式", items = "DISK_ONLY;DISK_ONLY_2;MEMORY_ONLY;MEMORY_ONLY_2;MEMORY_ONLY_SER;MEMORY_ONLY_SER_2;MEMORY_AND_DISK;MEMORY_AND_DISK_2;MEMORY_AND_DISK_SER;MEMORY_AND_DISK_SER_2") String persistMode) {
StorageLevel mode = null;
switch (persistMode) {
case "DISK_ONLY": mode = StorageLevel.DISK_ONLY();break;
case "DISK_ONLY_2": mode = StorageLevel.DISK_ONLY_2();break;
case "MEMORY_ONLY": mode = StorageLevel.MEMORY_ONLY();break;
case "MEMORY_ONLY_2": mode = StorageLevel.MEMORY_ONLY_2();break;
case "MEMORY_ONLY_SER": mode = StorageLevel.MEMORY_ONLY_SER();break;
case "MEMORY_ONLY_SER_2": mode = StorageLevel.MEMORY_AND_DISK_SER_2();break;
case "DISK_ONLYDISK_ONLY": mode = StorageLevel.MEMORY_AND_DISK();break;
case "MEMORY_AND_DISK_2": mode = StorageLevel.MEMORY_AND_DISK_2();break;
default:mode = StorageLevel.MEMORY_AND_DISK();
}
data.persist(mode);
return data;
}
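
    /**
     * Marks the dataset as no longer cached, releasing its blocks from memory and disk, and
     * returns it.
     */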
@InsightComponent( name = "Dataset Unpersist", description = "Dataset Unpersist" )
public static Dataset unpersist(
@InsightComponentArg(externalInput = true, name = "数据集", description = "数据集",defaultValue = "${output}") Dataset data) {
data.unpersist();
return data;
}
}