package com.datastax.spark.connector
import java.net.InetAddress
import com.datastax.spark.connector.cql._
import com.datastax.spark.connector.mapper.ColumnMapper
import com.datastax.spark.connector.rdd.partitioner.{CassandraPartitionedRDD, ReplicaPartitioner}
import com.datastax.spark.connector.rdd.reader._
import com.datastax.spark.connector.rdd._
import com.datastax.spark.connector.writer.{ReplicaLocator, _}
import org.apache.spark.SparkContext
import org.apache.spark.rdd.RDD
import scala.reflect.ClassTag
/** Provides Cassandra-specific methods on [[org.apache.spark.rdd.RDD RDD]].
* These methods become available on an RDD after `import com.datastax.spark.connector._`. */
class RDDFunctions[T](rdd: RDD[T]) extends WritableToCassandra[T] with Serializable {
override val sparkContext: SparkContext = rdd.sparkContext
/**
* Saves the data from [[org.apache.spark.rdd.RDD RDD]] to a Cassandra table. Uses the specified column names.
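*
* Example (a sketch; assumes an existing keyspace `test` with a table `words (word text PRIMARY KEY, count int)`): {{{
* case class WordCount(word: String, count: Int)
* val collection = sc.parallelize(Seq(WordCount("foo", 10), WordCount("bar", 20)))
* collection.saveToCassandra("test", "words", SomeColumns("word", "count"))
* }}}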
* @see [[com.datastax.spark.connector.writer.WritableToCassandra]]
*/
def saveToCassandra(
keyspaceName: String,
tableName: String,
columns: ColumnSelector = AllColumns,
writeConf: WriteConf = WriteConf.fromSparkConf(sparkContext.getConf),
tokenRangeAccumulator: Option[TokenRangeAccumulator] = None
)(
implicit
connector: CassandraConnector = CassandraConnector(sparkContext),
rwf: RowWriterFactory[T]): Unit = {
val writer = TableWriter(connector, keyspaceName, tableName, columns, writeConf, partitions = rdd.partitions, tokenRangeAcc = tokenRangeAccumulator)
rdd.sparkContext.runJob(rdd, writer.write _)
}
/**
* Saves the data from [[org.apache.spark.rdd.RDD RDD]] to a new table defined by the given `TableDef`.
*
* First it creates a new table with all columns from the `TableDef`
* and then it saves RDD content in the same way as [[saveToCassandra]].
* The table must not exist prior to this call.
*
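* Example (a sketch; `rddOut` and the new table's layout are illustrative, with column types
* from `com.datastax.spark.connector.types._`): {{{
* val newTable = TableDef("test", "words_new",
*   Seq(ColumnDef("word", PartitionKeyColumn, TextType)),
*   Seq(ColumnDef("num", ClusteringColumn(0), IntType)),
*   Seq(ColumnDef("count", RegularColumn, IntType)))
* rddOut.saveAsCassandraTableEx(newTable, SomeColumns("word", "num", "count"))
* }}}
*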
* @param table table definition used to create a new table
* @param columns selects the columns to save data to. Only unique column names are used,
* and you must select at least all primary key columns. Fields that are not selected
* are not written, so the corresponding table columns are left unchanged.
* This parameter does not affect table creation.
* @param writeConf additional configuration object that allows setting the consistency level, batch size, etc.
* @param connector optional, implicit connector to Cassandra
* @param rwf factory for obtaining the row writer to be used to extract column values
* from items of the [[org.apache.spark.rdd.RDD RDD]]
*/
def saveAsCassandraTableEx(
table: TableDef,
columns: ColumnSelector = AllColumns,
writeConf: WriteConf = WriteConf.fromSparkConf(sparkContext.getConf),
tokenRangeAccumulator: Option[TokenRangeAccumulator] = None)(
implicit
connector: CassandraConnector = CassandraConnector(sparkContext),
rwf: RowWriterFactory[T]): Unit = {
connector.withSessionDo(session => session.execute(table.cql))
saveToCassandra(table.keyspaceName, table.tableName, columns, writeConf, tokenRangeAccumulator)
}
/**
* Saves the data from [[org.apache.spark.rdd.RDD RDD]] to a new table with definition taken from the
* `ColumnMapper` for this class.
*
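* Example (a sketch; keyspace `test` is assumed to exist and table `words_new` must not): {{{
* case class WordCount(word: String, count: Long)
* val collection = sc.parallelize(Seq(WordCount("dog", 50), WordCount("cow", 60)))
* collection.saveAsCassandraTable("test", "words_new")
* }}}
*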
* @param keyspaceName keyspace where to create a new table
* @param tableName name of the table to create; the table must not exist
* @param columns selects the columns to save data to. Only unique column names are used,
* and you must select at least all primary key columns. Fields that are not selected
* are not written, so the corresponding table columns are left unchanged.
* This parameter does not affect table creation.
* @param writeConf additional configuration object that allows setting the consistency level, batch size, etc.
* @param connector optional, implicit connector to Cassandra
* @param rwf factory for obtaining the row writer to be used to extract column values
* from items of the [[org.apache.spark.rdd.RDD RDD]]
* @param columnMapper a column mapper determining the definition of the table
*/
def saveAsCassandraTable(
keyspaceName: String,
tableName: String,
columns: ColumnSelector = AllColumns,
writeConf: WriteConf = WriteConf.fromSparkConf(sparkContext.getConf))(
implicit
connector: CassandraConnector = CassandraConnector(sparkContext),
rwf: RowWriterFactory[T],
columnMapper: ColumnMapper[T]): Unit = {
val protocolVersion = connector.withSessionDo(_.getContext.getProtocolVersion)
val table = TableDef.fromType[T](keyspaceName, tableName, protocolVersion)
saveAsCassandraTableEx(table, columns, writeConf)
}
/**
* Deletes data from a Cassandra table, using data from the [[org.apache.spark.rdd.RDD RDD]] as primary keys.
* Uses the specified column names.
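*
* Example (a sketch; `keysRdd` carries primary keys of rows in an assumed `test.words` table): {{{
* // delete the whole rows identified by the keys in the RDD
* keysRdd.deleteFromCassandra("test", "words")
* // delete only the `count` column of those rows
* keysRdd.deleteFromCassandra("test", "words", SomeColumns("count"))
* }}}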
* @see [[com.datastax.spark.connector.writer.WritableToCassandra]]
*/
def deleteFromCassandra(
keyspaceName: String,
tableName: String,
deleteColumns: ColumnSelector = SomeColumns(),
keyColumns: ColumnSelector = PrimaryKeyColumns,
writeConf: WriteConf = WriteConf.fromSparkConf(sparkContext.getConf))(
implicit
connector: CassandraConnector = CassandraConnector(sparkContext),
rwf: RowWriterFactory[T]): Unit = {
// a column delete requires the full primary key; otherwise the partition key is enough
val columnDelete = deleteColumns match {
case c: SomeColumns => c.columns.nonEmpty
case _ => false
}
val writer = TableWriter(connector, keyspaceName, tableName, keyColumns, writeConf, !columnDelete)
rdd.sparkContext.runJob(rdd, writer.delete(deleteColumns) _)
}
/** Applies a function to each item and groups consecutive items having the same value together.
* Unlike `groupBy`, items from the same group must already be next to each other in the
* original collection. Works locally on each partition, so items from different
* partitions will never be placed in the same group. */
def spanBy[U](f: (T) => U): RDD[(U, Iterable[T])] =
new SpannedRDD[U, T](rdd, f)
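// Example use of spanBy (a sketch; `events` stands for an RDD of (key, value) pairs already
// ordered by key within each partition):
//   val grouped = events.spanBy(_._1) // RDD[(K, Iterable[(K, V)])]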
/**
* Uses the data from [[org.apache.spark.rdd.RDD RDD]] to join with a Cassandra table without
* retrieving the entire table.
* Any RDD that can be used with saveToCassandra can also be used with joinWithCassandraTable,
* as can any RDD whose rows specify only the partition key of a Cassandra table. This method
* executes single-partition requests against the Cassandra table and accepts the functional
* modifiers that a normal [[com.datastax.spark.connector.rdd.CassandraTableScanRDD]] takes.
*
* By default this method uses only the partition key for joining, but any combination of columns
* acceptable to Cassandra can be used in the join. Specify the columns with the `joinColumns`
* parameter or the `on()` method, as shown below.
*
* Example With Prior Repartitioning: {{{
* val source = sc.parallelize(keys).map(x => new KVRow(x))
* val repart = source.repartitionByCassandraReplica(keyspace, tableName, 10)
* val someCass = repart.joinWithCassandraTable(keyspace, tableName)
* }}}
*
* Example Joining on Clustering Columns: {{{
* val source = sc.parallelize(keys).map(x => (x, x * 100))
* val someCass = source.joinWithCassandraTable(keyspace, wideTable).on(SomeColumns("key", "group"))
* }}}
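*
* The returned RDD also accepts the usual `select` and `where` modifiers, for example
* (a sketch; the selected columns and predicate are illustrative): {{{
* val limited = source.joinWithCassandraTable(keyspace, wideTable)
*   .select("key", "group", "value")
*   .where("group >= ?", 500)
* }}}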
**/
def joinWithCassandraTable[R](
keyspaceName: String, tableName: String,
selectedColumns: ColumnSelector = AllColumns,
joinColumns: ColumnSelector = PartitionKeyColumns,
readConf: ReadConf = ReadConf.fromSparkConf(rdd.sparkContext.getConf))(
implicit
connector: CassandraConnector = CassandraConnector(sparkContext),
newType: ClassTag[R], rrf: RowReaderFactory[R],
ev: ValidRDDType[R],
currentType: ClassTag[T],
rwf: RowWriterFactory[T]): CassandraJoinRDD[T, R] = {
new CassandraJoinRDD[T, R](
rdd,
keyspaceName,
tableName,
connector,
columnNames = selectedColumns,
joinColumns = joinColumns,
readConf = readConf
)
}
/**
* Uses the data from [[org.apache.spark.rdd.RDD RDD]] to left join with a Cassandra table without
* retrieving the entire table.
* Any RDD that can be used with saveToCassandra can also be used with leftJoinWithCassandraTable,
* as can any RDD whose rows specify only the partition key of a Cassandra table. This method
* executes single-partition requests against the Cassandra table and accepts the functional
* modifiers that a normal [[com.datastax.spark.connector.rdd.CassandraTableScanRDD]] takes.
*
* By default this method uses only the partition key for joining, but any combination of columns
* acceptable to Cassandra can be used in the join. Specify the columns with the `joinColumns`
* parameter or the `on()` method.
*
* Example With Prior Repartitioning: {{{
* val source = sc.parallelize(keys).map(x => new KVRow(x))
* val repart = source.repartitionByCassandraReplica(keyspace, tableName, 10)
* val someCass = repart.leftJoinWithCassandraTable(keyspace, tableName)
* }}}
*
* Example Joining on Clustering Columns: {{{
* val source = sc.parallelize(keys).map(x => (x, x * 100))
* val someCass = source.leftJoinWithCassandraTable(keyspace, wideTable).on(SomeColumns("key", "group"))
* }}}
**/
def leftJoinWithCassandraTable[R](
keyspaceName: String, tableName: String,
selectedColumns: ColumnSelector = AllColumns,
joinColumns: ColumnSelector = PartitionKeyColumns,
readConf: ReadConf = ReadConf.fromSparkConf(rdd.sparkContext.getConf))(
implicit
connector: CassandraConnector = CassandraConnector(sparkContext),
newType: ClassTag[R], rrf: RowReaderFactory[R],
ev: ValidRDDType[R],
currentType: ClassTag[T],
rwf: RowWriterFactory[T]): CassandraLeftJoinRDD[T, R] = {
new CassandraLeftJoinRDD[T, R](
rdd,
keyspaceName,
tableName,
connector,
columnNames = selectedColumns,
joinColumns = joinColumns,
readConf = readConf
)
}
/**
* Repartitions the data (via a shuffle) based upon the replication of the given `keyspaceName` and `tableName`.
* Calling this method before joinWithCassandraTable ensures that requests are coordinator-local.
* `partitionsPerHost` controls the number of Spark partitions created by this repartitioning.
* The calling RDD must have rows that can be converted into the partition key of the given Cassandra table.
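*
* Example (a sketch; `keys`, `keyspace`, `tableName` and `KVRow` are illustrative): {{{
* val source = sc.parallelize(keys).map(x => new KVRow(x))
* val repart = source.repartitionByCassandraReplica(keyspace, tableName, partitionsPerHost = 10)
* val joined = repart.joinWithCassandraTable(keyspace, tableName)
* }}}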
**/
def repartitionByCassandraReplica(
keyspaceName: String,
tableName: String,
partitionsPerHost: Int = 10,
partitionKeyMapper: ColumnSelector = PartitionKeyColumns)(
implicit
connector: CassandraConnector = CassandraConnector(sparkContext),
currentType: ClassTag[T],
rwf: RowWriterFactory[T]): CassandraPartitionedRDD[T] = {
val partitioner = new ReplicaPartitioner[T](
tableName,
keyspaceName,
partitionsPerHost,
partitionKeyMapper,
connector)
val repart = rdd
.map((_,None))
.partitionBy(partitioner)
.mapPartitions(_.map(_._1), preservesPartitioning = true)
new CassandraPartitionedRDD[T](repart, keyspaceName, tableName)
}
/**
* Keys every row in the RDD with the IP addresses of all of the Cassandra nodes which contain a replica
* of the data specified by that row.
* The calling RDD must have rows that can be converted into the partition key of the given Cassandra table.
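*
* Example (a sketch; `keys`, `keyspace`, `tableName` and `KVRow` are illustrative): {{{
* val source = sc.parallelize(keys).map(x => new KVRow(x))
* val byReplica = source.keyByCassandraReplica(keyspace, tableName) // RDD[(Set[InetAddress], KVRow)]
* }}}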
*/
def keyByCassandraReplica(
keyspaceName: String,
tableName: String,
partitionKeyMapper: ColumnSelector = PartitionKeyColumns)(
implicit
connector: CassandraConnector = CassandraConnector(sparkContext),
currentType: ClassTag[T],
rwf: RowWriterFactory[T]): RDD[(Set[InetAddress], T)] = {
val replicaLocator = ReplicaLocator[T](connector, keyspaceName, tableName, partitionKeyMapper)
rdd.keyByCassandraReplica(replicaLocator)
}
/**
* A serializable version of keyByCassandraReplica which removes the implicit
* RowWriterFactory dependency.
*/
private[connector] def keyByCassandraReplica(
replicaLocator: ReplicaLocator[T])(
implicit
connector: CassandraConnector,
currentType: ClassTag[T]): RDD[(Set[InetAddress], T)] = {
rdd.mapPartitions(primaryKey =>
replicaLocator.keyByReplicas(primaryKey)
)
}
}