com.datastax.data.prepare.spark.dataset.database.HbaseHFileload.scala
package com.datastax.data.prepare.spark.dataset.database

import java.util.Date

import org.apache.hadoop.fs.Path
import org.apache.hadoop.hbase.{KeyValue, TableName}
import org.apache.hadoop.hbase.client.{ConnectionFactory, HTable, Put}
import org.apache.hadoop.hbase.io.ImmutableBytesWritable
import org.apache.hadoop.hbase.mapreduce.{HFileOutputFormat2, LoadIncrementalHFiles}
import org.apache.hadoop.hbase.util.Bytes
import org.apache.hadoop.mapreduce.Job
import org.apache.spark.sql.DataFrame

object HbaseHFileload {
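  /**
   * Bulk loads a Spark DataFrame into an HBase table: rows are deduplicated,
   * sorted into HBase byte order, written out as HFiles under a staging
   * directory, and finally handed to LoadIncrementalHFiles, which moves them
   * into the table's regions without going through the normal write path.
   */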
  def hbaseLoad(tableName: String, zookeepers: String, confParams: String, rowkey: String,
                family: String, columnNames: String, stagingFolder: String, dataset: DataFrame): Unit = {
    val conf = HBaseOperator.init(tableName, zookeepers, confParams)
    // Bulk load requires unique row keys, so deduplicate on the rowkey column.
    val value = dataset.dropDuplicates(rowkey)
    // Stage the HFiles in a unique directory per run.
    val hfilePath = stagingFolder + "-" + new Date().getTime
    // HBase stores cells in lexicographic byte order, so compare with
    // Bytes.compareTo rather than the default String ordering. Ties on the
    // row key are broken by family and then qualifier, as the HFile writer
    // expects fully sorted cells.
    implicit val cellOrdering: Ordering[(String, (String, String, String))] =
      new Ordering[(String, (String, String, String))] {
        override def compare(x: (String, (String, String, String)),
                             y: (String, (String, String, String))): Int = {
          val byRow = Bytes.compareTo(Bytes.toBytes(x._1), Bytes.toBytes(y._1))
          if (byRow != 0) byRow
          else {
            val byFamily = Bytes.compareTo(Bytes.toBytes(x._2._1), Bytes.toBytes(y._2._1))
            if (byFamily != 0) byFamily
            else Bytes.compareTo(Bytes.toBytes(x._2._2), Bytes.toBytes(y._2._2))
          }
        }
      }
    implicit val byteStringOrdering: Ordering[String] = new Ordering[String] {
      override def compare(x: String, y: String): Int =
        Bytes.compareTo(Bytes.toBytes(x), Bytes.toBytes(y))
    }
    // Guard against a missing column list instead of failing with an NPE below.
    val columns = if (columnNames != null) columnNames.split(",") else Array.empty[String]
    // Qualifiers within a row must be emitted in byte order too (picks up byteStringOrdering).
    val sortedColumns = columns.sorted
    val valueRdd = value.sort(rowkey).rdd.flatMap { row =>
      // Key each cell by the row's actual rowkey *value* (looked up by column
      // name), not by the name of the rowkey column, and resolve every column
      // by name rather than by positional index.
      val key = String.valueOf(row.getAs[Any](rowkey))
      sortedColumns.map(colName => (key, (family, colName, String.valueOf(row.getAs[Any](colName)))))
    }
    val sourceRDD = valueRdd.sortBy(identity)
    println(sourceRDD.partitions.size) // debug: partitions feeding the HFile writer
    val resultImmutable = sourceRDD.map { case (key, (colFamily, colName, colValue)) =>
      // Encode everything through Bytes.toBytes for a consistent charset
      // (String.getBytes would use the platform default).
      (new ImmutableBytesWritable(Bytes.toBytes(key)),
        new KeyValue(Bytes.toBytes(key), Bytes.toBytes(colFamily), Bytes.toBytes(colName), Bytes.toBytes(colValue)))
    }
    // Generate the HFiles. HFileOutputFormat2 supersedes the deprecated HFileOutputFormat.
    resultImmutable.saveAsNewAPIHadoopFile(hfilePath,
      classOf[ImmutableBytesWritable],
      classOf[KeyValue],
      classOf[HFileOutputFormat2],
      conf)
    // Raise the per-family HFile cap before constructing the loader so it is
    // read from this configuration. Bulk load bypasses the client write path,
    // so no write-buffer or autoflush tuning is needed here.
    conf.setInt("hbase.mapreduce.bulkload.max.hfiles.perRegion.perFamily", 2048)
    val load = new LoadIncrementalHFiles(conf)
    val conn = ConnectionFactory.createConnection(conf)
    val hTable = new HTable(conf, tableName)
    try {
      val regionLocator = conn.getRegionLocator(TableName.valueOf(tableName))
      val job = Job.getInstance(conf)
      job.setJobName("loadHBase")
      job.setMapOutputKeyClass(classOf[ImmutableBytesWritable])
      job.setMapOutputValueClass(classOf[Put])
      HFileOutputFormat2.configureIncrementalLoad(job, hTable, regionLocator)
      // Move the staged HFiles into the table's regions.
      load.doBulkLoad(new Path(hfilePath), hTable)
    } finally {
      hTable.close()
      conn.close()
    }
  }
}
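
A minimal usage sketch follows. The table name, ZooKeeper quorum, column list,
and input path are hypothetical, the target table is assumed to already exist
with the given column family, and the expected format of confParams is not
shown in this file (it is passed straight through to HBaseOperator.init):

import com.datastax.data.prepare.spark.dataset.database.HbaseHFileload
import org.apache.spark.sql.SparkSession

object HbaseHFileloadExample {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder()
      .appName("hbase-bulk-load-example")
      .getOrCreate()

    // Any DataFrame works; here we assume a CSV with a header row whose
    // columns include the row key ("id") and the payload columns.
    val df = spark.read.option("header", "true").csv("/data/users.csv")

    HbaseHFileload.hbaseLoad(
      tableName = "user_profile",           // hypothetical table, must already exist
      zookeepers = "zk1:2181,zk2:2181",     // hypothetical ZooKeeper quorum
      confParams = null,                    // extra HBase conf, format defined by HBaseOperator.init
      rowkey = "id",
      family = "cf",                        // existing column family
      columnNames = "name,age,email",       // comma-separated columns to load
      stagingFolder = "/tmp/hfile-staging", // HFiles are staged under this prefix
      dataset = df
    )
    spark.stop()
  }
}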