
com.datastax.data.prepare.spark.dataset.database.HbaseHFileload.scala
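A bulk-load helper for HBase: it turns each row of a Spark DataFrame into byte-sorted KeyValues, writes them out as HFiles under a staging directory, and hands the files to LoadIncrementalHFiles. The HBaseOperator.init helper it calls is defined elsewhere in this package. A hedged usage sketch follows the source below.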

package com.datastax.data.prepare.spark.dataset.database

import java.util.Date

import org.apache.hadoop.fs.Path
import org.apache.hadoop.hbase.{KeyValue, TableName}
import org.apache.hadoop.hbase.client.ConnectionFactory
import org.apache.hadoop.hbase.io.ImmutableBytesWritable
import org.apache.hadoop.hbase.mapreduce.{HFileOutputFormat2, LoadIncrementalHFiles}
import org.apache.hadoop.hbase.util.Bytes
import org.apache.hadoop.mapreduce.Job
import org.apache.spark.sql.DataFrame

object HbaseHFileload {

  def hbaseLoad(tableName: String, zookeepers: String, confParams: String, rowkey: String,
                family: String, columnNames: String, stagingFolder: String, dataset: DataFrame): Unit = {

    val conf = HBaseOperator.init(tableName, zookeepers, confParams)
    // de-duplicate on the row-key column: HBase keeps one cell per (row, family, qualifier) anyway
    val value = dataset.dropDuplicates(rowkey)
    // unique staging directory per run so repeated loads do not collide
    val hfilePath = stagingFolder + "-" + new Date().getTime

    // byte-wise orderings, matching HBase's own comparator; HFiles must be written in this
    // order (rowkey, then family, then qualifier), not in natural String order
    implicit val sort_1: Ordering[(String, (String, String, String))] =
      new Ordering[(String, (String, String, String))] {
        override def compare(x: (String, (String, String, String)), y: (String, (String, String, String))): Int = {
          val byRow = Bytes.compareTo(Bytes.toBytes(x._1), Bytes.toBytes(y._1))
          if (byRow != 0) byRow
          else {
            // tie-break on family, then qualifier, so cells within a row are ordered too
            val byFamily = Bytes.compareTo(Bytes.toBytes(x._2._1), Bytes.toBytes(y._2._1))
            if (byFamily != 0) byFamily
            else Bytes.compareTo(Bytes.toBytes(x._2._2), Bytes.toBytes(y._2._2))
          }
        }
      }
    implicit val sort_2: Ordering[String] = new Ordering[String] {
      override def compare(x: String, y: String): Int =
        Bytes.compareTo(Bytes.toBytes(x), Bytes.toBytes(y))
    }
    val columns = if (columnNames != null) columnNames.split(",") else Array.empty[String]
    // qualifiers must be emitted in byte order within each row; the HFile writer rejects out-of-order cells
    val sortedColumns = columns.sorted(sort_2)
    val value_rdd = value.rdd.flatMap(data => {
      // use the *value* of the row-key column for this row, not the column name itself
      val rowKeyValue = data.getAs[Any](rowkey).toString
      for (colName <- sortedColumns) yield {
        val colValue = data.getAs[Any](colName)
        (rowKeyValue, (family, colName, if (colValue == null) "" else colValue.toString))
      }
    })
    // single global byte-order sort (rowkey, then family, then qualifier); this replaces the
    // earlier String-order DataFrame sort, which did not match HBase's byte comparator
    val sourceRDD = value_rdd.sortBy(yu => yu)
    println(sourceRDD.partitions.size) // debug: partition count after the sort
    val resultImmutable = sourceRDD.map { case (rowKey, (colFa, colName, colValue)) =>
      // Bytes.toBytes throughout, so every field is UTF-8 encoded consistently with the sort above
      (new ImmutableBytesWritable(Bytes.toBytes(rowKey)),
        new KeyValue(Bytes.toBytes(rowKey), Bytes.toBytes(colFa), Bytes.toBytes(colName), Bytes.toBytes(colValue)))
    }
    conf.setInt("hbase.mapreduce.bulkload.max.hfiles.perRegion.perFamily", 2048)
    val conn = ConnectionFactory.createConnection(conf)
    val table = conn.getTable(TableName.valueOf(tableName))
    val admin = conn.getAdmin
    try {
      val regionLocator = conn.getRegionLocator(TableName.valueOf(tableName))
      // configure the job against the live table *before* writing, so the generated
      // HFiles are partitioned and compressed to match the table's regions
      val job = Job.getInstance(conf)
      job.setJobName("loadHBase")
      job.setMapOutputKeyClass(classOf[ImmutableBytesWritable])
      job.setMapOutputValueClass(classOf[KeyValue])
      HFileOutputFormat2.configureIncrementalLoad(job, table, regionLocator)

      // write the sorted KeyValues out as HFiles under the staging path
      resultImmutable.saveAsNewAPIHadoopFile(hfilePath,
        classOf[ImmutableBytesWritable],
        classOf[KeyValue],
        classOf[HFileOutputFormat2],
        job.getConfiguration)

      // hand the staged HFiles over to the region servers
      val load = new LoadIncrementalHFiles(conf)
      load.doBulkLoad(new Path(hfilePath), admin, table, regionLocator)
    } finally {
      admin.close()
      table.close()
      conn.close()
    }
  }

}
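A minimal caller sketch, not part of the original file: every literal below (table name, ZooKeeper quorum, paths, column names) is a hypothetical placeholder, and HBaseOperator.init is assumed to build the HBase Configuration exactly as hbaseLoad above expects.

object HbaseHFileloadExample {
  def main(args: Array[String]): Unit = {
    val spark = org.apache.spark.sql.SparkSession.builder()
      .appName("hbase-bulk-load")
      .getOrCreate()
    // hypothetical input: a DataFrame with columns "id", "name", "age"
    val df = spark.read.parquet("/data/users")
    HbaseHFileload.hbaseLoad(
      tableName     = "users",                 // target HBase table (placeholder)
      zookeepers    = "zk1:2181,zk2:2181",     // ZooKeeper quorum (placeholder)
      confParams    = "",                      // extra settings; format is defined by HBaseOperator.init
      rowkey        = "id",                    // DataFrame column whose values become row keys
      family        = "cf",                    // column family all qualifiers are written under
      columnNames   = "id,name,age",           // comma-separated qualifiers to load
      stagingFolder = "/tmp/hfile-staging",    // HDFS prefix for the generated HFiles
      dataset       = df)
    spark.stop()
  }
}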



