package com.datastax.data.prepare.spark.dataset.database;

import com.alibaba.fastjson.JSONArray;
import com.alibaba.fastjson.JSONObject;
import com.datastax.insight.annonation.InsightComponent;
import com.datastax.insight.annonation.InsightComponentArg;
import com.datastax.insight.spec.Operator;
import org.apache.hadoop.hbase.KeyValue;
import org.apache.hadoop.hbase.client.Put;
import org.apache.hadoop.hbase.io.ImmutableBytesWritable;
import org.apache.hadoop.hbase.util.Bytes;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.function.FlatMapFunction;
import org.apache.spark.api.java.function.Function;
import org.apache.spark.api.java.function.PairFlatMapFunction;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import scala.Tuple2;
import scala.collection.JavaConversions;
import scala.collection.JavaConverters;

import java.math.BigDecimal;
import java.util.*;
import java.util.stream.Collectors;

public class HBaseHandler implements Operator {

    @InsightComponent( name = "loadHbase", description = "loadHbase")
    public static Dataset<Row> load(
            @InsightComponentArg(name = "table", description = "HBase table name", request = true) String table,
            @InsightComponentArg(name = "zookeepers", description = "list of ZooKeeper hosts, separated by semicolons", request = true) String zookeepers,
            @InsightComponentArg(name = "confParams", description = "conf file parameters", request = true) String confParams,
            @InsightComponentArg(name = "jsonProps", description = "HBase parameter configuration (JSON; see the specification comment at the bottom of this class)", request = true) JSONObject jsonObject) {
        String action = jsonObject.getString("action");
        if ("selectAction".equals(action)) {
            return hbaseSelect(table, zookeepers, confParams, jsonObject);
        } else if ("filterAction".equals(action)) {
            return hbaseFilter(table, zookeepers, confParams, jsonObject);
        }
        // Unknown action: nothing to load
        return null;
    }

    @InsightComponent( name = "loadHbase", description = "loadHbase")
    public static Dataset  load(
            @InsightComponentArg(name = "table", description = "数据库名",request = true) String table,
            @InsightComponentArg(name = "zookeepers", description = "zookeepers列表,以分号隔开",request = true) String zookeepers,
            @InsightComponentArg(name = "confParams", description = "conf配置文件",request = true) String confParams,
            @InsightComponentArg(name = "family", description = "列族",request = true) String family,
            @InsightComponentArg(name = "各列名", description = "每一列以分号隔开",request = true) String columns) {
        if (table == null || zookeepers == null || confParams == null || family == null || columns == null) {
            return null;
        }
        return HBaseOperator.hbase2dataset(table.trim(),zookeepers.trim(),confParams.trim(),family.trim(),columns.trim());
    }
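    // Illustrative call of the overload above (table name, ZooKeeper hosts and column names are hypothetical;
    // confParams holds whatever extra HBase configuration the deployment expects). A single column family
    // "info" is read, with its qualifiers passed as a semicolon-separated list:
    //   Dataset<Row> users = load("user_profile", "zk-1;zk-2;zk-3", confParams, "info", "name;age;city");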
    @InsightComponent( name = "saveToHbase", description = "将dataset数据保存到数据库")
    public static void  HfileLoad(
            @InsightComponentArg(name = "table", description = "数据库名",request = true) String table,
            @InsightComponentArg(name = "zookeepers", description = "zookeepers列表,以分号隔开",request = true) String zookeepers,
            @InsightComponentArg(name = "confParams", description = "conf配置文件",request = true) String confParams,
            @InsightComponentArg(name = "rowKey", description = "rowkey列对应的列名",request = true) String rowKey,
            @InsightComponentArg(name = "family", description = "列族以及对应的列",request = true) String family,
            @InsightComponentArg(name = "columnNames", description = "columnNames",request = true) String columnNames,
            @InsightComponentArg(name = "columnNames", description = "columnNames",request = true) String stagingFolder,
            @InsightComponentArg(name = "dataset", description = "数据集",request = true) Dataset dataset) {
        System.out.println("hello");
        HbaseHFileload.hbaseLoad(table,zookeepers,confParams,rowKey,family,columnNames,stagingFolder,dataset);
    }

    @InsightComponent( name = "saveToHbase", description = "将dataset数据保存到数据库")
    public static void  save(
            @InsightComponentArg(name = "table", description = "数据库名",request = true) String table,
            @InsightComponentArg(name = "zookeepers", description = "zookeepers列表,以分号隔开",request = true) String zookeepers,
            @InsightComponentArg(name = "confParams", description = "conf配置文件",request = true) String confParams,
            @InsightComponentArg(name = "rowKeyCol", description = "rowkey名",request = true) String rowKey,
            @InsightComponentArg(name = "familyAndCols", description = "列族",request = true) String family,
            @InsightComponentArg(name = "colsType", description = "每一列以逗号隔开",request = true) String cols,
            @InsightComponentArg(name = "dataset", description = "数据集",request = true) Dataset dataset) {
        // rowkeyCol->
        // familyAndCol->family1:col1,col2;family2:col3,col4
        // colType->col1:String;col2:Long
        hbaseSave2(table,zookeepers,confParams,rowKey,family,cols,dataset);
    }
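    // Illustrative call of save(...) above (all names are hypothetical); it shows the expected shape of
    // the familyAndCols and colsType arguments described in the format comment:
    //   save("user_profile", "zk-1;zk-2;zk-3", confParams,
    //        "id",                                // rowKeyCol
    //        "info:name,age;stats:score",         // familyAndCols
    //        "name:String;age:Int;score:Double",  // colsType
    //        dataset);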
    @InsightComponent( name = "saveByGenerateHFile", description = "将dataset数据保存到数据库")
    public static void  saveByGenerateHFile(
            @InsightComponentArg(name = "table", description = "数据库名",request = true) String table,
            @InsightComponentArg(name = "zookeepers", description = "zookeepers列表,以分号隔开",request = true) String zookeepers,
            @InsightComponentArg(name = "hdfsPath", description = "hdfs路径,如 hdfs://node-1:5000",request = true) String hdfsPath,
            @InsightComponentArg(name = "hfilePath", description = "hfile在hdfs上的临时存储路径",request = true) String hfilePath,
            @InsightComponentArg(name = "confParams", description = "conf配置文件",request = true) String confParams,
            @InsightComponentArg(name = "rowKeyCol", description = "rowkey列对应的列名",request = true) String rowKeyCol,
            @InsightComponentArg(name = "familyAndCols", description = "列族以及对应的列",request = true) String familyAndCols,
            @InsightComponentArg(name = "colsType", description = "每一列对应的数据类型",request = true) String colsType,
            @InsightComponentArg(name = "dataset", description = "数据集",request = true) Dataset dataset) {
        // rowkeyCol->
        // familyAndCol->family1:col1,col2;family2:col3,col4
        // colType->col1:String;col2:Long
        hbaseSaveByHFile(table,zookeepers,hdfsPath,hfilePath,confParams,rowKeyCol,familyAndCols,colsType,dataset);
    }
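    // Illustrative call of saveByGenerateHFile(...) above (cluster addresses and paths are hypothetical).
    // A random UUID subdirectory is appended to hfilePath below, so a fixed base path can be reused across runs:
    //   saveByGenerateHFile("user_profile", "zk-1;zk-2;zk-3",
    //           "hdfs://node-1:5000", "/tmp/hbase-hfiles",
    //           confParams, "id",
    //           "info:name,age", "name:String;age:Int",
    //           dataset);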
    private static void hbaseSaveByHFile(String table, String zookeepers, String hdfsPath, String hfilePath,
                                         String confParams, String rowKeyCol, String family, String allCols, Dataset<Row> dataset){
        // Append a random subdirectory so repeated runs do not collide on the same HFile path
        hfilePath = hfilePath.endsWith("/") ? hfilePath + UUID.randomUUID() : hfilePath + "/" + UUID.randomUUID();
        System.out.println("hfilePath: " + hfilePath);
        JavaRDD<Row> rowJavaRDD = dataset.dropDuplicates(rowKeyCol).sort(rowKeyCol).javaRDD();
        // Sort the column names (HFiles require cells in sorted order)
        String[] split = allCols.split(",");
        List<String> cols = Arrays.asList(split).stream().sorted().collect(Collectors.toList());
        cols.forEach(r -> System.out.println(r));
        JavaRDD<List<Tuple2<ImmutableBytesWritable, KeyValue>>> rdd1 = rowJavaRDD.mapPartitions(new FlatMapFunction<Iterator<Row>, List<Tuple2<ImmutableBytesWritable, KeyValue>>>() {
            @Override
            public Iterator<List<Tuple2<ImmutableBytesWritable, KeyValue>>> call(Iterator<Row> rowIterator) throws Exception {
                ArrayList<List<Tuple2<ImmutableBytesWritable, KeyValue>>> results = new ArrayList<>();
                while (rowIterator.hasNext()){
                    Row row = rowIterator.next();
                    ArrayList<Tuple2<ImmutableBytesWritable, KeyValue>> result = new ArrayList<>();
                    // The row key must be non-null and non-empty before any of the row's cells are added
                    if (row.getAs(rowKeyCol) != null && !row.getAs(rowKeyCol).toString().trim().isEmpty()){
                        for (int i = 0; i < cols.size(); i++) {
                            // NOTE: the original body of this loop was lost in extraction; the cell construction
                            // below is a best-effort reconstruction that assumes `family` names a single column
                            // family and each entry of `cols` is a plain column name.
                            String colName = cols.get(i);
                            if (row.getAs(colName) != null && !row.getAs(colName).toString().trim().isEmpty()) {
                                KeyValue keyValue = new KeyValue(Bytes.toBytes(row.getAs(rowKeyCol).toString()),
                                        Bytes.toBytes(family), Bytes.toBytes(colName),
                                        Bytes.toBytes(row.getAs(colName).toString()));
                                result.add(new Tuple2<>((new ImmutableBytesWritable(Bytes.toBytes(row.getAs(rowKeyCol).toString()))), keyValue));
                            }
                        }
                    }
                    results.add(result);
                }

                return results.iterator();
            }
        });
        JavaPairRDD<ImmutableBytesWritable, KeyValue> rdd2 = rdd1.flatMapToPair(new PairFlatMapFunction<List<Tuple2<ImmutableBytesWritable, KeyValue>>, ImmutableBytesWritable, KeyValue>() {
            @Override
            public Iterator<Tuple2<ImmutableBytesWritable, KeyValue>> call(List<Tuple2<ImmutableBytesWritable, KeyValue>> tuple2s) throws Exception {
                return tuple2s.iterator();
            }
        });
        // Delegate the HFile generation and bulk load to HBaseOperator
        HBaseOperator.saveByGenerateHFile(rdd2, table, zookeepers, confParams, hdfsPath, hfilePath);
    }

//    private static void hbaseSave(String table, String zookeepers, String confParams, JSONObject jsonObject, Dataset<Row> dataset){
//        JSONArray selections = jsonObject.getJSONArray("selections");
//        // Column family -> names of all columns in that family
//        Map<String, String[]> map = new HashMap<>();
//        for (Object object : selections) {
//            String family = JSONObject.parseObject(object.toString()).getString("family");
//            JSONArray qualifiersArray = JSONObject.parseArray(JSONObject.parseObject(object.toString()).getString("qualifiers"));
//            String[] qualifiers = qualifiersArray.toArray(new String[qualifiersArray.size()]);
//            map.put(family, qualifiers);
//        }
//        // Column -> data type of the column
//        JSONArray qualifierTypes = jsonObject.getJSONArray("qualifierTypes");
//        Map<String, String> mapType = new HashMap<>();
//        // Assumes no two columns share the same name?
//        for (Object object : qualifierTypes) {
//            String qualifier = JSONObject.parseObject(object.toString()).getString("qualifier");
//            String type = JSONObject.parseObject(object.toString()).getString("type");
//            mapType.put(qualifier, type);
//        }
//        // Name of the row-key column
//        String rowKeyCol = jsonObject.getString("rowKeyCol");
//        // Convert the dataset to a JavaRDD
//        JavaRDD<Row> rowJavaRDD = dataset.javaRDD();
//        // Walk the RDD and build the puts: rowKey, column family, column name, value
//        JavaPairRDD<ImmutableBytesWritable, Put> rdd = rowJavaRDD.mapToPair(row -> {
//            Put put = null;
//            // Iterate over the column families
//            for(Map.Entry<String, String[]> entry : map.entrySet()){
//                String family = entry.getKey();
//                // row key: a String id
//                put = new Put(Bytes.toBytes(row.getAs(rowKeyCol).toString()));
//                // Iterate over the columns, converting each value to bytes according to its type
//                for(String colName : entry.getValue()){
//                    put.addColumn(family.getBytes(), colName.getBytes(), objConvert2Bytes(row.getAs(colName), mapType.get(colName)));
//                }
//            }
//            return new Tuple2<>((new ImmutableBytesWritable()), put);
//        });
//        HBaseOperator.save2(rdd, table, zookeepers, confParams);
//    }

    private static Dataset<Row> hbaseSelect(String table, String zookeepers, String confParams, JSONObject jsonObject) {
        JSONArray selections = jsonObject.getJSONArray("selections");
        Map<String, String[]> map = new HashMap<>();
        for (Object object : selections) {
            String family = JSONObject.parseObject(object.toString()).getString("family");
            JSONArray qualifiersArray = JSONObject.parseArray(JSONObject.parseObject(object.toString()).getString("qualifiers"));
            String[] qualifiers = qualifiersArray.toArray(new String[qualifiersArray.size()]);
            map.put(family, qualifiers);
        }
        JSONArray qualifierTypes = jsonObject.getJSONArray("qualifierTypes");
        Map<String, String> mapType = new HashMap<>();
        for (Object object : qualifierTypes) {
            String qualifier = JSONObject.parseObject(object.toString()).getString("qualifier");
            String type = JSONObject.parseObject(object.toString()).getString("type");
            mapType.put(qualifier, type);
        }
        return HBaseOperator.load(table, zookeepers, confParams, JavaConverters.mapAsScalaMapConverter(map).asScala(), JavaConverters.mapAsScalaMapConverter(mapType).asScala());
    }

    private static Dataset<Row> hbaseFilter(String table, String zookeepers, String confParams, JSONObject jsonObject) {
        String filterOperator = jsonObject.getString("filterOperator");
        List<Map<String, String>> filterParams = new ArrayList<>();
        JSONArray params = jsonObject.getJSONArray("params");
        for (Object object : params) {
            Map<String, String> mapParam = new HashMap<>();
            mapParam.put("filter", JSONObject.parseObject(object.toString()).getString("filter"));
            mapParam.put("compareOp", JSONObject.parseObject(object.toString()).getString("compareOp"));
            mapParam.put("comparator", JSONObject.parseObject(object.toString()).getString("comparator"));
            mapParam.put("family", JSONObject.parseObject(object.toString()).getString("family"));
            mapParam.put("qualifier", JSONObject.parseObject(object.toString()).getString("qualifier"));
            mapParam.put("qualifierPrefixs", JSONObject.parseObject(object.toString()).getString("qualifierPrefixs"));
            mapParam.put("value", JSONObject.parseObject(object.toString()).getString("value"));
            mapParam.put("valueType", JSONObject.parseObject(object.toString()).getString("valueType"));
            filterParams.add(mapParam);
        }
        JSONArray qualifierTypes = jsonObject.getJSONArray("qualifierTypes");
        Map<String, String> mapType = new HashMap<>();
        for (Object object : qualifierTypes) {
            String qualifier = JSONObject.parseObject(object.toString()).getString("qualifier");
            String type = JSONObject.parseObject(object.toString()).getString("type");
            mapType.put(qualifier, type);
        }
        return HBaseOperator.filterOperator(table, zookeepers, confParams, filterOperator, JavaConversions.asScalaBuffer(filterParams).toList(), JavaConverters.mapAsScalaMapConverter(mapType).asScala());
    }

    private static void hbaseSave2(String table, String zookeepers, String confParams, String rowKeyCol, String familyAndCols, String colsType, Dataset<Row> dataset){
        // Column family -> names of all columns in that family
        Map<String, String[]> map = new HashMap<>();
        // familyAndCol -> family1:col1,col2;family2:col3,col4
        for(String tmp : familyAndCols.split(";")){
            // e.g. family1:col1,col2
            String[] tmp2 = tmp.split(":");
            map.put(tmp2[0], tmp2[1].split(","));
        }

        // Convert the dataset to a JavaRDD, keeping only rows that have a row key
        JavaRDD<Row> rowJavaRDD = dataset.javaRDD().filter(row -> row.getAs(rowKeyCol) != null);

        // Walk the RDD and build the puts: rowKey, column family, column name, value
        JavaPairRDD<ImmutableBytesWritable, Put> rdd = rowJavaRDD.mapToPair(row -> {
            Put put = null;
            //if(row.getAs(rowKeyCol) != null && !row.getAs(rowKeyCol).toString().trim().isEmpty()) {
            // Iterate over the column families
            for (Map.Entry<String, String[]> entry : map.entrySet()) {
                String family = entry.getKey();
                // row key: a String id
                put = new Put(Bytes.toBytes(row.getAs(rowKeyCol).toString()));
                // Iterate over the columns; only non-empty values are written
                for (String colName : entry.getValue()) {
                    if (row.getAs(colName) != null && !row.getAs(colName).toString().trim().isEmpty()) {
                        put.addColumn(family.getBytes(), colName.getBytes(), row.getAs(colName).toString().getBytes());
                    }
                }
            }
            //}
            return new Tuple2<>((new ImmutableBytesWritable()), put);
        });
        HBaseOperator.save2(rdd, table, zookeepers, confParams);
    }
    /**
     * Convert an object to the given type and return its byte representation.
     * @param value the value to convert
     * @param valueType the name of the target type (Int, Short, BigDecimal, Long, Double, Float, String, Boolean)
     * @return the byte representation of the value
     */
    private static byte[] objConvert2Bytes(Object value,String valueType){
        byte[] bytes = null;
        switch (valueType){
            case "Int":
                bytes = Bytes.toBytes((Integer) value);
                break;
            case "Short":
                bytes = Bytes.toBytes((Short) value);
                break;
            case "BigDecimal":
                bytes = Bytes.toBytes((BigDecimal) value);
                break;
            case "Long":
                bytes = Bytes.toBytes((Long) value);
                break;
            case "Double":
                bytes = Bytes.toBytes((Double) value);
                break;
            case "Float":
                bytes = Bytes.toBytes((Float) value);
                break;
            case "String":
                bytes = Bytes.toBytes((String) value);
                break;
            case "Boolean":
                bytes = Bytes.toBytes((Boolean) value);
                break;
            default:
                // Fall back to the value's String representation if the type is not recognised
                bytes = Bytes.toBytes(value.toString());
        }
        return bytes;
    }
/**
 * JSON specification for the jsonProps argument of load(...)
 */
//    {
//        action: selectAction                          (select specific columns)
//        selections: [                                 (selection info)
//            {
//                family: free-text input               (column family)
//                qualifiers: [                         (columns)
//                    "col1", "col2"
//                    ...
//                ]
//            }
//            ...
//        ]
//
//        qualifierTypes: [                                                          (data types of the columns)
//            {
//                qualifier: free-text input                                         (column)
//                type: [Int,Short,Long,BigDecimal,Double,Float,String,Boolean]      (data type)
//            }
//            ...
//        ]
//    }
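//
//    A hypothetical selectAction payload (family and column names are illustrative only):
//    {
//        "action": "selectAction",
//        "selections": [
//            { "family": "info", "qualifiers": ["name", "age"] }
//        ],
//        "qualifierTypes": [
//            { "qualifier": "name", "type": "String" },
//            { "qualifier": "age",  "type": "Int" }
//        ]
//    }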
//
//    {
//        action: filterAction                              (filter)
//        filterOperator: [pass_all,pass_one]               (filter scope: all filters must pass / any one may pass)
//        params: [                                         (filter info)
//            {
//                filter: [rowKey,family,simpleColumn,qualifier,multipleColumn]                                   (filter type)
//                compareOp: [LESS,LESS_OR_EQUAL,EQUAL,NOT_EQUAL,GREATER_OR_EQUAL,GREATER,NO_OP]                  (comparison operator)
//                comparator: [BinaryComparator,BinaryPrefixComparator,NullComparator,RegexStringComparator,SubstringComparator,LongComparator]  (comparator)
//                family: free-text input                   (column family)
//                qualifier: free-text input                (column)
//                qualifierPrefixs: free-text input         (column-name prefixes)
//                value: free-text input                    (filter value)
//                valueType: [Int,Short,Long,BigDecimal,Double,Float,String,Boolean]                              (data type of the filter value)
//            }
//            ...
//        ]
//        qualifierTypes: [
//            {
//                qualifier: free-text input
//                type: [Int,Short,Long,BigDecimal,Double,Float,String,Boolean]
//            }
//            ...
//        ]
//    }
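//
//    A hypothetical filterAction payload (family, qualifier and value are illustrative only); it keeps rows
//    whose info:age cell equals 30 and reads name back as String and age as Int:
//    {
//        "action": "filterAction",
//        "filterOperator": "pass_all",
//        "params": [
//            {
//                "filter": "simpleColumn",
//                "compareOp": "EQUAL",
//                "comparator": "BinaryComparator",
//                "family": "info",
//                "qualifier": "age",
//                "qualifierPrefixs": "",
//                "value": "30",
//                "valueType": "Int"
//            }
//        ],
//        "qualifierTypes": [
//            { "qualifier": "name", "type": "String" },
//            { "qualifier": "age",  "type": "Int" }
//        ]
//    }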

}



