
package com.datastax.data.prepare.spark.dataset.database;
import com.alibaba.fastjson.JSONArray;
import com.alibaba.fastjson.JSONObject;
import com.datastax.insight.annonation.InsightComponent;
import com.datastax.insight.annonation.InsightComponentArg;
import com.datastax.insight.spec.Operator;
import org.apache.hadoop.hbase.KeyValue;
import org.apache.hadoop.hbase.client.Put;
import org.apache.hadoop.hbase.io.ImmutableBytesWritable;
import org.apache.hadoop.hbase.util.Bytes;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.function.FlatMapFunction;
import org.apache.spark.api.java.function.Function;
import org.apache.spark.api.java.function.PairFlatMapFunction;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import scala.Tuple2;
import scala.collection.JavaConversions;
import scala.collection.JavaConverters;
import java.math.BigDecimal;
import java.util.*;
import java.util.stream.Collectors;
public class HBaseHandler implements Operator {
    @InsightComponent(name = "loadHbase", description = "loadHbase")
    public static Dataset load(
            @InsightComponentArg(name = "table", description = "table name", request = true) String table,
            @InsightComponentArg(name = "zookeepers", description = "list of zookeepers, separated by semicolons", request = true) String zookeepers,
            @InsightComponentArg(name = "confParams", description = "conf configuration file", request = true) String confParams,
            @InsightComponentArg(name = "jsonProps", description = "HBase parameter configuration", request = true) JSONObject jsonObject) {
        String action = jsonObject.getString("action");
        if ("selectAction".equals(action)) {
            return hbaseSelect(table, zookeepers, confParams, jsonObject);
        } else if ("filterAction".equals(action)) {
            return hbaseFilter(table, zookeepers, confParams, jsonObject);
        }
        System.out.println("TEST1");
        return null;
    }
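    // The JSON schema expected by load(...) above is not documented in this file; the sketch
    // below is inferred from hbaseSelect/hbaseFilter and is only an assumption, not a spec:
    //   {
    //     "action": "selectAction",                          // or "filterAction"
    //     "selections": [ { "family": "info", "qualifiers": ["name", "age"] } ],
    //     "qualifierTypes": [ { "qualifier": "name", "type": "String" },
    //                         { "qualifier": "age",  "type": "Long" } ]
    //   }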
    @InsightComponent(name = "loadHbase", description = "loadHbase")
    public static Dataset load(
            @InsightComponentArg(name = "table", description = "table name", request = true) String table,
            @InsightComponentArg(name = "zookeepers", description = "list of zookeepers, separated by semicolons", request = true) String zookeepers,
            @InsightComponentArg(name = "confParams", description = "conf configuration file", request = true) String confParams,
            @InsightComponentArg(name = "family", description = "column family", request = true) String family,
            @InsightComponentArg(name = "columns", description = "column names, separated by semicolons", request = true) String columns) {
        if (table == null || zookeepers == null || confParams == null || family == null || columns == null) {
            return null;
        }
        return HBaseOperator.hbase2dataset(table.trim(), zookeepers.trim(), confParams.trim(), family.trim(), columns.trim());
    }
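    // Illustrative call of the load(...) overload above (hedged sketch; the table name,
    // ZooKeeper quorum, and empty confParams are placeholders, not values from this project):
    //   Dataset ds = HBaseHandler.load("user_table", "zk1:2181;zk2:2181", "", "info", "name;age;city");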
    @InsightComponent(name = "saveToHbase", description = "save the dataset to the database")
    public static void HfileLoad(
            @InsightComponentArg(name = "table", description = "table name", request = true) String table,
            @InsightComponentArg(name = "zookeepers", description = "list of zookeepers, separated by semicolons", request = true) String zookeepers,
            @InsightComponentArg(name = "confParams", description = "conf configuration file", request = true) String confParams,
            @InsightComponentArg(name = "rowKey", description = "name of the column used as the rowkey", request = true) String rowKey,
            @InsightComponentArg(name = "family", description = "column families and their columns", request = true) String family,
            @InsightComponentArg(name = "columnNames", description = "columnNames", request = true) String columnNames,
            @InsightComponentArg(name = "stagingFolder", description = "stagingFolder", request = true) String stagingFolder,
            @InsightComponentArg(name = "dataset", description = "dataset", request = true) Dataset dataset) {
        System.out.println("hello");
        HbaseHFileload.hbaseLoad(table, zookeepers, confParams, rowKey, family, columnNames, stagingFolder, dataset);
    }
    @InsightComponent(name = "saveToHbase", description = "save the dataset to the database")
    public static void save(
            @InsightComponentArg(name = "table", description = "table name", request = true) String table,
            @InsightComponentArg(name = "zookeepers", description = "list of zookeepers, separated by semicolons", request = true) String zookeepers,
            @InsightComponentArg(name = "confParams", description = "conf configuration file", request = true) String confParams,
            @InsightComponentArg(name = "rowKeyCol", description = "rowkey name", request = true) String rowKey,
            @InsightComponentArg(name = "familyAndCols", description = "column family", request = true) String family,
            @InsightComponentArg(name = "colsType", description = "each column separated by commas", request = true) String cols,
            @InsightComponentArg(name = "dataset", description = "dataset", request = true) Dataset dataset) {
        // rowkeyCol->
        // familyAndCol->family1:col1,col2;family2:col3,col4
        // colType->col1:String;col2:Long
        hbaseSave2(table, zookeepers, confParams, rowKey, family, cols, dataset);
    }
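    // Illustrative call of save(...) above, following the argument formats sketched in its
    // comments (all concrete values are placeholders, not taken from this project):
    //   HBaseHandler.save("user_table", "zk1:2181;zk2:2181", "",
    //           "id",                                      // rowKeyCol
    //           "family1:col1,col2;family2:col3,col4",     // familyAndCols
    //           "col1:String;col2:Long",                   // colsType
    //           dataset);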
    @InsightComponent(name = "saveByGenerateHFile", description = "save the dataset to the database")
    public static void saveByGenerateHFile(
            @InsightComponentArg(name = "table", description = "table name", request = true) String table,
            @InsightComponentArg(name = "zookeepers", description = "list of zookeepers, separated by semicolons", request = true) String zookeepers,
            @InsightComponentArg(name = "hdfsPath", description = "HDFS path, e.g. hdfs://node-1:5000", request = true) String hdfsPath,
            @InsightComponentArg(name = "hfilePath", description = "temporary storage path on HDFS for the HFiles", request = true) String hfilePath,
            @InsightComponentArg(name = "confParams", description = "conf configuration file", request = true) String confParams,
            @InsightComponentArg(name = "rowKeyCol", description = "name of the column used as the rowkey", request = true) String rowKeyCol,
            @InsightComponentArg(name = "familyAndCols", description = "column families and their columns", request = true) String familyAndCols,
            @InsightComponentArg(name = "colsType", description = "data type of each column", request = true) String colsType,
            @InsightComponentArg(name = "dataset", description = "dataset", request = true) Dataset dataset) {
        // rowkeyCol->
        // familyAndCol->family1:col1,col2;family2:col3,col4
        // colType->col1:String;col2:Long
        hbaseSaveByHFile(table, zookeepers, hdfsPath, hfilePath, confParams, rowKeyCol, familyAndCols, colsType, dataset);
    }
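    // Illustrative call of saveByGenerateHFile(...) above (hedged sketch; host names,
    // paths, and the empty confParams are placeholders):
    //   HBaseHandler.saveByGenerateHFile("user_table", "zk1:2181;zk2:2181",
    //           "hdfs://node-1:5000", "/tmp/hbase-hfile-staging", "",
    //           "id", "family1:col1,col2;family2:col3,col4", "col1:String;col2:Long", dataset);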
    private static void hbaseSaveByHFile(String table, String zookeepers, String hdfsPath, String hfilePath,
                                         String confParams, String rowKeyCol, String family, String allCols, Dataset dataset) {
        // Append a random segment so repeated runs write to distinct HFile staging paths.
        hfilePath = hfilePath.endsWith("/") ? hfilePath + UUID.randomUUID() : hfilePath + "/" + UUID.randomUUID();
        System.out.println("hfilePath: " + hfilePath);
        // HFile bulk loading expects cells in sorted order, so de-duplicate and sort by the rowkey column.
        JavaRDD<Row> rowJavaRDD = dataset.dropDuplicates(rowKeyCol).sort(rowKeyCol).javaRDD();
        // Sort the column names.
        String[] split = allCols.split(",");
        List<String> cols = Arrays.asList(split).stream().sorted().collect(Collectors.toList());
        cols.forEach(r -> System.out.println(r));
        JavaRDD<List<Tuple2<ImmutableBytesWritable, KeyValue>>> rdd1 = rowJavaRDD.mapPartitions(new FlatMapFunction<Iterator<Row>, List<Tuple2<ImmutableBytesWritable, KeyValue>>>() {
            @Override
            public Iterator<List<Tuple2<ImmutableBytesWritable, KeyValue>>> call(Iterator<Row> rowIterator) throws Exception {
                ArrayList<List<Tuple2<ImmutableBytesWritable, KeyValue>>> results = new ArrayList<>();
                while (rowIterator.hasNext()) {
                    Row row = rowIterator.next();
                    ArrayList<Tuple2<ImmutableBytesWritable, KeyValue>> result = new ArrayList<>();
                    // The rowkey must be non-null and non-empty before any cell of this row is added.
                    if (row.getAs(rowKeyCol) != null && !row.getAs(rowKeyCol).toString().trim().isEmpty()) {
                        for (int i = 0; i < cols.size(); i++) {
                            // Reconstructed from the garbled listing (assumption): one KeyValue per non-null column, keyed by rowkey, the family argument as-is, and the column name.
                            if (row.getAs(cols.get(i)) != null) {
                                KeyValue keyValue = new KeyValue(Bytes.toBytes(row.getAs(rowKeyCol).toString()),
                                        Bytes.toBytes(family), Bytes.toBytes(cols.get(i)), Bytes.toBytes(row.getAs(cols.get(i)).toString()));
                                result.add(new Tuple2<>((new ImmutableBytesWritable(Bytes.toBytes(row.getAs(rowKeyCol).toString()))), keyValue));
                            }
                        }
                    }
                    results.add(result);
                }
                return results.iterator();
            }
        });
        JavaPairRDD<ImmutableBytesWritable, KeyValue> rdd2 = rdd1.flatMapToPair(new PairFlatMapFunction<List<Tuple2<ImmutableBytesWritable, KeyValue>>, ImmutableBytesWritable, KeyValue>() {
            @Override
            public Iterator<Tuple2<ImmutableBytesWritable, KeyValue>> call(List<Tuple2<ImmutableBytesWritable, KeyValue>> tuple2s) throws Exception {
                return tuple2s.iterator();
            }
        });
        // Hand the (rowkey, KeyValue) pairs to the HFile generator and bulk load them.
        HBaseOperator.saveByGenerateHFile(rdd2, table, zookeepers, confParams, hdfsPath, hfilePath);
    }
//    private static void hbaseSave(String table, String zookeepers, String confParams, JSONObject jsonObject, Dataset dataset) {
//        JSONArray selections = jsonObject.getJSONArray("selections");
//        // column family -> names of all columns under that family
//        Map<String, String[]> map = new HashMap<>();
//        for (Object object : selections) {
//            String family = JSONObject.parseObject(object.toString()).getString("family");
//            JSONArray qualifiersArray = JSONObject.parseArray(JSONObject.parseObject(object.toString()).getString("qualifiers"));
//            String[] qualifiers = qualifiersArray.toArray(new String[qualifiersArray.size()]);
//            map.put(family, qualifiers);
//        }
//        // column -> data type of that column
//        JSONArray qualifierTypes = jsonObject.getJSONArray("qualifierTypes");
//        Map<String, String> mapType = new HashMap<>();
//        // assumes no two columns share the same name?
//        for (Object object : qualifierTypes) {
//            String qualifier = JSONObject.parseObject(object.toString()).getString("qualifier");
//            String type = JSONObject.parseObject(object.toString()).getString("type");
//            mapType.put(qualifier, type);
//        }
//        // get the rowkey column name
//        String rowKeyCol = jsonObject.getString("rowKeyCol");
//        // convert the dataset to a JavaRDD
//        JavaRDD<Row> rowJavaRDD = dataset.javaRDD();
//        // iterate over the RDD and insert rowKey / column family / column name / value
//        JavaPairRDD<ImmutableBytesWritable, Put> rdd = rowJavaRDD.mapToPair(row -> {
//            Put put = null;
//            // iterate over the column families
//            for (Map.Entry<String, String[]> entry : map.entrySet()) {
//                String family = entry.getKey();
//                // rowkey: String id
//                put = new Put(Bytes.toBytes(row.getAs(rowKeyCol).toString()));
//                // iterate over all columns, convert each value to bytes by its type, and add it to the Put
//                for (String colName : entry.getValue()) {
//                    put.addColumn(family.getBytes(), colName.getBytes(), objConvert2Bytes(row.getAs(colName), mapType.get(colName)));
//                }
//            }
//            return new Tuple2<>((new ImmutableBytesWritable()), put);
//        });
//        HBaseOperator.save2(rdd, table, zookeepers, confParams);
//    }
    private static Dataset hbaseSelect(String table, String zookeepers, String confParams, JSONObject jsonObject) {
        JSONArray selections = jsonObject.getJSONArray("selections");
        // column family -> its qualifiers
        Map<String, String[]> map = new HashMap<>();
        for (Object object : selections) {
            String family = JSONObject.parseObject(object.toString()).getString("family");
            JSONArray qualifiersArray = JSONObject.parseArray(JSONObject.parseObject(object.toString()).getString("qualifiers"));
            String[] qualifiers = qualifiersArray.toArray(new String[qualifiersArray.size()]);
            map.put(family, qualifiers);
        }
        // qualifier -> its data type
        JSONArray qualifierTypes = jsonObject.getJSONArray("qualifierTypes");
        Map<String, String> mapType = new HashMap<>();
        for (Object object : qualifierTypes) {
            String qualifier = JSONObject.parseObject(object.toString()).getString("qualifier");
            String type = JSONObject.parseObject(object.toString()).getString("type");
            mapType.put(qualifier, type);
        }
        return HBaseOperator.load(table, zookeepers, confParams, JavaConverters.mapAsScalaMapConverter(map).asScala(), JavaConverters.mapAsScalaMapConverter(mapType).asScala());
    }
    private static Dataset hbaseFilter(String table, String zookeepers, String confParams, JSONObject jsonObject) {
        String filterOperator = jsonObject.getString("filterOperator");
        List