package com.datastax.insight.ml.spark.hbase;

import com.datastax.insight.core.driver.SparkContextBuilder;
import com.datastax.insight.spec.RDDOperator;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hbase.HBaseConfiguration;
import org.apache.hadoop.hbase.KeyValue;
import org.apache.hadoop.hbase.TableName;
import org.apache.hadoop.hbase.client.*;
import org.apache.hadoop.hbase.io.ImmutableBytesWritable;
import org.apache.hadoop.hbase.mapreduce.HFileOutputFormat2;
import org.apache.hadoop.hbase.mapreduce.LoadIncrementalHFiles;
import org.apache.hadoop.hbase.mapreduce.TableInputFormat;
import org.apache.hadoop.hbase.mapreduce.TableOutputFormat;
import org.apache.hadoop.hbase.util.Bytes;
import org.apache.hadoop.mapreduce.Job;

import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.PairFunction;

import scala.Tuple2;

import java.io.IOException;
public class HBaseRDDHandler implements RDDOperator {

    // Sample code to load data from HBase (kept for reference):
    // public static JavaPairRDD load(String table) {
    //     Configuration conf = HBaseConfiguration.create();
    //     conf.set("hbase.zookeeper.quorum", "datastax-cdh61,datastax-cdh25,datastax-cdh66");
    //     conf.set("hbase.zookeeper.property.clientPort", "2181");
    //     conf.set(TableInputFormat.INPUT_TABLE, table);
    //
    //     JavaSparkContext sc = SparkContextBuilder.getJContext();
    //     JavaPairRDD hBaseRDD = sc.newAPIHadoopRDD(conf,
    //             TableInputFormat.class, ImmutableBytesWritable.class,
    //             Result.class);
    //
    //     return hBaseRDD;
    // }
    /** Reads an HBase table as a (row key, Result) pair RDD via TableInputFormat. */
    public static JavaPairRDD load(HBaseConfiguration conf, String table) {
        conf.set(TableInputFormat.INPUT_TABLE, table);

        JavaSparkContext sc = SparkContextBuilder.getJContext();
        JavaPairRDD hBaseRDD = sc.newAPIHadoopRDD(conf,
                TableInputFormat.class, ImmutableBytesWritable.class,
                Result.class);

        return hBaseRDD;
    }
    /**
     * Writes a pair RDD into an existing HBase table via TableOutputFormat;
     * the RDD values are expected to be HBase Mutations (typically Put).
     */
    public static void save(HBaseConfiguration conf, JavaPairRDD rdd, String table) throws IOException {
        conf.set(TableOutputFormat.OUTPUT_TABLE, table);

        Job job = Job.getInstance(conf);
        job.setOutputFormatClass(TableOutputFormat.class);

        rdd.saveAsNewAPIHadoopDataset(job.getConfiguration());
    }
    /**
     * Writes a (row key, KeyValue) pair RDD out as HFiles under the given path via
     * HFileOutputFormat2; the RDD must be sorted by row key for the HFiles to be valid.
     */
    public static void hFile(HBaseConfiguration conf, JavaPairRDD rdd, String path) throws IOException {
        rdd.saveAsNewAPIHadoopFile(path, ImmutableBytesWritable.class, KeyValue.class, HFileOutputFormat2.class, conf);
    }
    /** Bulk-loads previously generated HFiles from the given path into an HBase table. */
    public static void hFileToHBase(HBaseConfiguration conf, String path, String tableName) throws Exception {
        LoadIncrementalHFiles load = new LoadIncrementalHFiles(conf);
        Connection conn = ConnectionFactory.createConnection(conf);
        Table table = conn.getTable(TableName.valueOf(tableName));
        try {
            // Get the region distribution of the target HBase table.
            RegionLocator regionLocator = conn.getRegionLocator(TableName.valueOf(tableName));

            // Create a Hadoop MapReduce job and give it a name.
            Job job = Job.getInstance(conf);
            job.setJobName("DumpFile");

            // The output key must be ImmutableBytesWritable and the value KeyValue,
            // because the job is set up to produce HFiles.
            job.setMapOutputKeyClass(ImmutableBytesWritable.class);
            job.setMapOutputValueClass(KeyValue.class);

            // Configure HFileOutputFormat2 against the target table and its regions.
            HFileOutputFormat2.configureIncrementalLoad(job, table, regionLocator);

            // Start the bulk load; the cast relies on the HBase 1.x client returning an HTable.
            load.doBulkLoad(new Path(path), (HTable) table);
        } finally {
            table.close();
            conn.close();
        }
    }
}
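
Usage note (not part of the original source): the driver sketch below shows one way these helpers might be wired together, assuming an HBase 1.x client on the classpath and that SparkContextBuilder has already initialized the JavaSparkContext. The ZooKeeper hosts, table name, column family/qualifier, and staging path are hypothetical placeholders.

package com.datastax.insight.ml.spark.hbase;

import org.apache.hadoop.hbase.HBaseConfiguration;
import org.apache.hadoop.hbase.client.Result;
import org.apache.hadoop.hbase.io.ImmutableBytesWritable;
import org.apache.hadoop.hbase.util.Bytes;
import org.apache.spark.api.java.JavaPairRDD;
import scala.Tuple2;

public class HBaseRDDHandlerExample {

    @SuppressWarnings({"unchecked", "deprecation"})
    public static void main(String[] args) throws Exception {
        // Hypothetical cluster settings; the deprecated constructor is used only because
        // the handler's methods take an HBaseConfiguration rather than a plain Configuration.
        HBaseConfiguration conf = new HBaseConfiguration();
        conf.set("hbase.zookeeper.quorum", "zk-host-1,zk-host-2,zk-host-3");
        conf.set("hbase.zookeeper.property.clientPort", "2181");

        // Read a table as (row key, Result) pairs.
        JavaPairRDD<ImmutableBytesWritable, Result> rows =
                HBaseRDDHandler.load(conf, "demo_table");

        // Example transformation: pull one column value per row and print a few of them.
        rows.mapToPair(t -> new Tuple2<>(
                        Bytes.toString(t._1().get()),
                        Bytes.toString(t._2().getValue(Bytes.toBytes("cf"), Bytes.toBytes("col")))))
                .take(10)
                .forEach(p -> System.out.println(p._1() + " -> " + p._2()));

        // Bulk-load path: hFile() expects a (row key, KeyValue) RDD sorted by row key;
        // building that RDD is omitted here.
        // HBaseRDDHandler.hFile(conf, hfileRdd, "/tmp/hfile-staging");
        // HBaseRDDHandler.hFileToHBase(conf, "/tmp/hfile-staging", "demo_table");
    }
}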