org.apache.hadoop.hbase.spark.HBaseRDDFunctions.scala Maven / Gradle / Ivy

Go to download
Show more of this group Show more artifacts with this name
Show all versions of hbase-spark Show documentation
There is a newer version: 2.0.0-alpha4
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.hadoop.hbase.spark

import java.util

import org.apache.hadoop.hbase.{HConstants, TableName}
import org.apache.hadoop.hbase.classification.InterfaceAudience;
import org.apache.hadoop.hbase.client._
import org.apache.hadoop.hbase.io.ImmutableBytesWritable
import org.apache.spark.rdd.RDD

import scala.reflect.ClassTag

/**
 * Holder for the implicit enrichment that adds HBase-oriented
 * operations to a Spark RDD.  Every operation simply forwards to the
 * matching method on the supplied HBaseContext.
 */
@InterfaceAudience.Public
object HBaseRDDFunctions {

  /**
   * Implicit wrapper that makes the HBase helper methods available on
   * an RDD holding elements of any type.
   *
   * @param rdd The RDD being enriched
   * @tparam T  The element type of the RDD
   */
  implicit class GenericHBaseRDDFunctions[T](val rdd: RDD[T]) {

    /**
     * Shortcut to HBaseContext's bulkPut.  No new RDD is produced;
     * think of it as a foreach over the RDD.
     *
     * @param hc         The HBaseContext that identifies which HBase
     *                   cluster connection to use
     * @param tableName  The table the puts are sent to
     * @param f          Converts each RDD value into an HBase Put
     */
    def hbaseBulkPut(
        hc: HBaseContext,
        tableName: TableName,
        f: (T) => Put): Unit =
      hc.bulkPut(rdd, tableName, f)

    /**
     * Shortcut to HBaseContext's bulkGet.  Conceptually an RDD map:
     * every RDD value is turned into a Get, and the value produced
     * from the fetched Result populates the new RDD.
     *
     * @param hc             The HBaseContext that identifies which
     *                       HBase cluster connection to use
     * @param tableName      The table the gets are sent to
     * @param batchSize      How many gets to execute in a single batch
     * @param f              Converts each RDD value into an HBase Get
     * @param convertResult  Converts an HBase Result into the value
     *                       stored in the resulting RDD
     * @tparam R             The element type of the resulting RDD
     * @return               An RDD holding the converted values
     */
    def hbaseBulkGet[R: ClassTag](
        hc: HBaseContext,
        tableName: TableName,
        batchSize: Int,
        f: (T) => Get,
        convertResult: (Result) => R): RDD[R] =
      hc.bulkGet[T, R](tableName, batchSize, rdd, f, convertResult)

    /**
     * Shortcut to HBaseContext's bulkGet using a built-in converter.
     * A Result that is non-null and has a non-null row key becomes a
     * pair of (row key wrapped in ImmutableBytesWritable, Result);
     * any other Result yields a null element.
     *
     * @param hc             The HBaseContext that identifies which
     *                       HBase cluster connection to use
     * @param tableName      The table the gets are sent to
     * @param batchSize      How many gets to execute in a single batch
     * @param f              Converts each RDD value into an HBase Get
     * @return               An RDD of (row key, Result) pairs; elements
     *                       may be null as described above
     */
    def hbaseBulkGet(
        hc: HBaseContext,
        tableName: TableName,
        batchSize: Int,
        f: (T) => Get): RDD[(ImmutableBytesWritable, Result)] = {
      // Kept as a local function literal so it references nothing
      // from the enclosing wrapper instance.
      val toKeyedResult: Result => (ImmutableBytesWritable, Result) = res =>
        if (res == null || res.getRow == null) {
          null
        } else {
          (new ImmutableBytesWritable(res.getRow), res)
        }

      hc.bulkGet[T, (ImmutableBytesWritable, Result)](
        tableName, batchSize, rdd, f, toKeyedResult)
    }

    /**
     * Shortcut to HBaseContext's bulkDelete.  No new RDD is produced.
     *
     * @param hc         The HBaseContext that identifies which HBase
     *                   cluster connection to use
     * @param tableName  The table the deletes are sent to
     * @param f          Converts each RDD value into an HBase Delete
     * @param batchSize  The number of deletes sent in a single batch
     */
    def hbaseBulkDelete(
        hc: HBaseContext,
        tableName: TableName,
        f: (T) => Delete,
        batchSize: Int): Unit =
      hc.bulkDelete(rdd, tableName, f, batchSize)

    /**
     * Shortcut to HBaseContext's foreachPartition.  This acts very
     * much like the normal RDD foreachPartition, except that an HBase
     * connection is available while iterating through the values.
     *
     * @param hc  The HBaseContext that identifies which HBase
     *            cluster connection to use
     * @param f   Receives the iterator for one partition of the RDD
     *            together with a connection object to HBase
     */
    def hbaseForeachPartition(
        hc: HBaseContext,
        f: (Iterator[T], Connection) => Unit): Unit =
      hc.foreachPartition(rdd, f)

    /**
     * Shortcut to HBaseContext's mapPartitions.  This acts very much
     * like the normal RDD mapPartitions, except that an HBase
     * connection is available while iterating through the values.
     *
     * @param hc  The HBaseContext that identifies which HBase
     *            cluster connection to use
     * @param f   Receives the iterator for one partition of the RDD
     *            together with a connection object to HBase, and
     *            returns the iterator of output values
     * @tparam R  The element type of the resulting RDD
     * @return    The resulting RDD of type R
     */
    def hbaseMapPartitions[R: ClassTag](
        hc: HBaseContext,
        f: (Iterator[T], Connection) => Iterator[R]): RDD[R] =
      hc.mapPartitions[T, R](rdd, f)

    /**
     * Shortcut to HBaseContext's bulkLoad: a Spark implementation of
     * HBase bulk load intended for wide rows, or for data whose values
     * are not already combined at the time of the map process.
     *
     * The content of the RDD is sorted and shuffled with respect to
     * region splits, and the result of that sort and shuffle is
     * written to HFiles.
     *
     * After this function has executed, the user still has to call
     * LoadIncrementalHFiles.doBulkLoad(...) to move the files into
     * HBase.
     *
     * Note that this version of bulk load differs from past versions
     * in that the qualifier is part of the sort process, so that rows
     * with a very large number of columns can be supported.
     *
     * @param hc                             The HBaseContext that identifies which HBase
     *                                       cluster connection to use
     * @param tableName                      The HBase table we are loading into
     * @param flatMap                        A flatMap function that turns every row in the
     *                                       RDD into N cells for the bulk load
     * @param stagingDir                     The location on the FileSystem to bulk load into
     * @param familyHFileWriteOptionsMap     Options that define how the HFile for a
     *                                       column family is written
     * @param compactionExclude              Compaction excluded for the HFiles
     * @param maxSize                        Max size for the HFiles before they roll
     */
    def hbaseBulkLoad(
        hc: HBaseContext,
        tableName: TableName,
        flatMap: (T) => Iterator[(KeyFamilyQualifier, Array[Byte])],
        stagingDir: String,
        familyHFileWriteOptionsMap: util.Map[Array[Byte], FamilyHFileWriteOptions] =
          new util.HashMap[Array[Byte], FamilyHFileWriteOptions](),
        compactionExclude: Boolean = false,
        maxSize: Long = HConstants.DEFAULT_MAX_FILE_SIZE): Unit =
      hc.bulkLoad(rdd, tableName, flatMap, stagingDir,
        familyHFileWriteOptionsMap, compactionExclude, maxSize)

    /**
     * Shortcut to HBaseContext's bulkLoadThinRows: a Spark
     * implementation of HBase bulk load for short rows, somewhere
     * under 1000 columns.  For tables with thinner rows this should be
     * faster than the other Spark bulk load, which puts only one value
     * into each record going into the shuffle.
     *
     * The content of the RDD is sorted and shuffled with respect to
     * region splits, and the result of that sort and shuffle is
     * written to HFiles.
     *
     * After this function has executed, the user still has to call
     * LoadIncrementalHFiles.doBulkLoad(...) to move the files into
     * HBase.
     *
     * In this implementation only the row key is given to the shuffle
     * as the key, and all the columns are already linked to the row
     * key before the shuffle stage.  Sorting of the qualifiers is done
     * in memory, outside of the shuffle stage.
     *
     * @param hc                             The HBaseContext that identifies which HBase
     *                                       cluster connection to use
     * @param tableName                      The HBase table we are loading into
     * @param mapFunction                    Converts the RDD records to the key value
     *                                       format used for the shuffle, to prepare for
     *                                       writing to the bulk loaded HFiles
     * @param stagingDir                     The location on the FileSystem to bulk load into
     * @param familyHFileWriteOptionsMap     Options that define how the HFile for a
     *                                       column family is written
     * @param compactionExclude              Compaction excluded for the HFiles
     * @param maxSize                        Max size for the HFiles before they roll
     */
    def hbaseBulkLoadThinRows(
        hc: HBaseContext,
        tableName: TableName,
        mapFunction: (T) => (ByteArrayWrapper, FamiliesQualifiersValues),
        stagingDir: String,
        familyHFileWriteOptionsMap: util.Map[Array[Byte], FamilyHFileWriteOptions] =
          new util.HashMap[Array[Byte], FamilyHFileWriteOptions](),
        compactionExclude: Boolean = false,
        maxSize: Long = HConstants.DEFAULT_MAX_FILE_SIZE): Unit =
      hc.bulkLoadThinRows(rdd, tableName, mapFunction, stagingDir,
        familyHFileWriteOptionsMap, compactionExclude, maxSize)
  }
}