com.datastax.spark.connector.rdd.CassandraRDD.scala

package com.datastax.spark.connector.rdd

import com.datastax.spark.connector.cql._
import com.datastax.spark.connector.rdd.ClusteringOrder.{Ascending, Descending}
import com.datastax.spark.connector.rdd.reader._
import com.datastax.spark.connector.types.TypeConverter
import com.datastax.spark.connector.util.ConfigCheck
import com.datastax.spark.connector.{ColumnSelector, SomeColumns, _}
import org.apache.spark.rdd.RDD
import org.apache.spark.{Dependency, SparkContext}

import scala.language.existentials
import scala.reflect.ClassTag

abstract class CassandraRDD[R : ClassTag](
    sc: SparkContext,
    dep: Seq[Dependency[_]])
  extends RDD[R](sc, dep) {

  /** This is slightly different from Scala's `this.type`.
    * `this.type` is the unique singleton type of an object, which is not compatible with other
    * instances of the same type, so returning anything other than `this` is not really possible
    * without lying to the compiler via explicit casts.
    * Here `Self` is used to return a copy of the object - a different instance of the same type. */
  type Self <: CassandraRDD[R]
  
  ConfigCheck.checkConfig(sc.getConf)


  protected[connector] def keyspaceName: String

  protected[connector] def tableName: String

  protected def columnNames: ColumnSelector

  protected def where: CqlWhereClause

  protected def readConf: ReadConf

  protected def limit: Option[CassandraLimit]

  protected def clusteringOrder: Option[ClusteringOrder]

  protected def connector: CassandraConnector

  def toEmptyCassandraRDD: EmptyCassandraRDD[R]

  /** Counts the number of items in this RDD by executing `SELECT count(*)` on the Cassandra table */
  def cassandraCount(): Long
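
  /* Usage sketch: `cassandraCount` pushes the count down to Cassandra, whereas the generic
   * RDD `count()` fetches every row and counts it in Spark. Assuming a hypothetical keyspace
   * "ks" with a table "words":
   * {{{
   * val rows = sc.cassandraTable("ks", "words")
   * rows.cassandraCount()   // SELECT count(*) executed by Cassandra
   * rows.count()            // rows fetched and counted by Spark
   * }}}
   */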

  /** Returns a copy of this RDD with some of the properties changed */
  protected def copy(
    columnNames: ColumnSelector = columnNames,
    where: CqlWhereClause = where,
    limit: Option[CassandraLimit] = limit,
    clusteringOrder: Option[ClusteringOrder] = None,
    readConf: ReadConf = readConf,
    connector: CassandraConnector = connector): Self

  /** Returns a copy of this RDD with a custom read configuration, e.g. consistency level or fetch size. */
  def withReadConf(readConf: ReadConf): Self =
    copy(readConf = readConf)
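
  /* Usage sketch: assuming the `ReadConf` case class exposes a `fetchSizeInRows` parameter,
   * as it does in most connector versions:
   * {{{
   * import com.datastax.spark.connector.rdd.ReadConf
   *
   * sc.cassandraTable("ks", "words")
   *   .withReadConf(ReadConf(fetchSizeInRows = 1000))
   * }}}
   */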
  
  /** Returns a copy of this Cassandra RDD with the specified connector */
  def withConnector(connector: CassandraConnector): Self = {
    copy(connector = connector)
  }
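
  /* Usage sketch: reading the same table through a connector pointed at a different cluster;
   * the host address below is purely illustrative:
   * {{{
   * val otherCluster = CassandraConnector(
   *   sc.getConf.set("spark.cassandra.connection.host", "10.0.0.1"))
   *
   * sc.cassandraTable("ks", "words").withConnector(otherCluster)
   * }}}
   */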

  /** Adds CQL `WHERE` predicates to the query.
    * Useful for leveraging secondary indexes in Cassandra.
    * Implicitly adds an `ALLOW FILTERING` clause to the query;
    * however, beware that some predicates might be rejected by Cassandra,
    * particularly when they filter on an unindexed, non-clustering column. */
  def where(cql: String, values: Any*): Self = {
    copy(where = where and CqlWhereClause(Seq(cql), values))
  }
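
  /* Usage sketch: assuming a hypothetical table "ks.events" with a secondary index on the
   * "event_type" column, the predicate below is pushed down to Cassandra:
   * {{{
   * sc.cassandraTable("ks", "events")
   *   .where("event_type = ?", "click")
   * }}}
   */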

  /** Narrows down the selected set of columns.
    * Use this for better performance when you don't need all the columns in the result RDD.
    * When called multiple times, it selects a subset of the already selected columns, so
    * after a column has been removed by a previous `select` call, it cannot be added back.
    *
    * The selected columns are [[ColumnRef]] instances. This type allows specifying columns for
    * straightforward retrieval as well as reading the TTL or write time of regular columns. Implicit
    * conversions included in the [[com.datastax.spark.connector]] package make it possible to provide
    * just column names (which is also backward compatible) and optionally add a `.ttl` or `.writeTime`
    * suffix in order to create an appropriate [[ColumnRef]] instance.
    */
  def select(columns: ColumnRef*): Self = {
    copy(columnNames = SomeColumns(narrowColumnSelection(columns): _*))
  }
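
  /* Usage sketch: selecting plain columns plus the write time of one of them, via the implicit
   * String-to-ColumnRef conversions mentioned above; table and column names are hypothetical:
   * {{{
   * sc.cassandraTable("ks", "words")
   *   .select("word", "count", "count".writeTime)
   * }}}
   */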

  /** Adds a `LIMIT` clause to the CQL `SELECT` statement. The limit is applied separately to each
    * created Spark partition. In other words, unless the data is fetched from a single Cassandra
    * partition, the total number of results is unpredictable.
    *
    * The main purpose of passing a limit clause is to fetch the top n rows from a single Cassandra
    * partition, when the table is designed so that it uses clustering keys and a partition key
    * predicate is passed to the where clause. */
  def limit(rowLimit: Long): Self = {
    copy(limit = Some(SparkPartitionLimit(rowLimit)))
  }
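
  /* Usage sketch: fetching the first 10 rows of a single Cassandra partition of a hypothetical
   * table "ks.events" partitioned by "user_id":
   * {{{
   * sc.cassandraTable("ks", "events")
   *   .where("user_id = ?", "u1")   // restricts the scan to one Cassandra partition
   *   .limit(10)
   * }}}
   */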

  /** Adds a `PER PARTITION LIMIT` clause to the CQL `SELECT` statement. The limit is applied to
    * every Cassandra partition. Only valid for Cassandra 3.6+. */
  def perPartitionLimit(rowLimit: Long): Self = {
    copy(limit = Some(CassandraPartitionLimit(rowLimit)))
  }
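
  /* Usage sketch: keeping at most 3 rows from every Cassandra partition of a hypothetical
   * table "ks.events" (requires Cassandra 3.6+):
   * {{{
   * sc.cassandraTable("ks", "events").perPartitionLimit(3)
   * }}}
   */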

  /** Adds a CQL `ORDER BY` clause to the query.
    * It can be applied only when the table has clustering columns and a primary key predicate is
    * pushed down in `where`.
    * It is useful when the default direction of ordering rows within a single Cassandra partition
    * needs to be changed. */
  def clusteringOrder(order: ClusteringOrder): Self = {
    copy(clusteringOrder = Some(order))
  }
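
  /* Usage sketch: combining a partition key predicate, a descending clustering order and a limit
   * to read the most recent rows of a single Cassandra partition; table and column names are
   * hypothetical:
   * {{{
   * sc.cassandraTable("ks", "events")
   *   .where("user_id = ?", "u1")
   *   .clusteringOrder(ClusteringOrder.Descending)
   *   .limit(10)
   * }}}
   */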

  def withAscOrder: Self = clusteringOrder(Ascending)
  
  def withDescOrder: Self = clusteringOrder(Descending)

  override def take(num: Int): Array[R] = {
    limit match {
      case Some(_) => super.take(num)
      case None => limit(num).take(num)
    }
  }
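
  /* Note: the override above pushes a limit down to Cassandra when none has been set explicitly,
   * so for example:
   * {{{
   * sc.cassandraTable("ks", "words").take(5)   // behaves like .limit(5).take(5)
   * }}}
   */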

  protected def narrowColumnSelection(columns: Seq[ColumnRef]): Seq[ColumnRef]

  // Needs to be public for the Java API
  val selectedColumnRefs: Seq[ColumnRef]

  def selectedColumnNames: Seq[String] = selectedColumnRefs.map(_.cqlValueName)

  // convertTo must be implemented for classes which wish to support `.as`
  protected def convertTo[B : ClassTag : RowReaderFactory]: CassandraRDD[B] =
    throw new NotImplementedError(s"convertTo not implemented for this class")

  /** Maps each row into an object of a different type using a provided function that takes the
    * column value(s) as argument(s). Can be used to convert each row to a tuple or a case class object:
    * {{{
    * sc.cassandraTable("ks", "table")
    *   .select("column1")
    *   .as((s: String) => s)                 // yields CassandraRDD[String]
    *
    * sc.cassandraTable("ks", "table")
    *   .select("column1", "column2")
    *   .as((_: String, _: Long))             // yields CassandraRDD[(String, Long)]
    *
    * case class MyRow(key: String, value: Long)
    * sc.cassandraTable("ks", "table")
    *   .select("column1", "column2")
    *   .as(MyRow)                            // yields CassandraRDD[MyRow]
    * }}} */
  def as[B: ClassTag, A0: TypeConverter](f: A0 => B): CassandraRDD[B] = {
    implicit val ft = new FunctionBasedRowReader1(f)
    convertTo[B]
  }

  def as[B: ClassTag, A0: TypeConverter, A1: TypeConverter](f: (A0, A1) => B): CassandraRDD[B] = {
    implicit val ft = new FunctionBasedRowReader2(f)
    convertTo[B]
  }

  def as[B: ClassTag, A0: TypeConverter, A1: TypeConverter, A2: TypeConverter]
    (f: (A0, A1, A2) => B): CassandraRDD[B] = {

    implicit val ft = new FunctionBasedRowReader3(f)
    convertTo[B]
  }

  def as[B: ClassTag, A0: TypeConverter, A1: TypeConverter, A2: TypeConverter,
         A3: TypeConverter](f: (A0, A1, A2, A3) => B) = {
    implicit val ft = new FunctionBasedRowReader4(f)
    convertTo[B]
  }

  def as[B: ClassTag, A0: TypeConverter, A1: TypeConverter, A2: TypeConverter, A3: TypeConverter,
  A4: TypeConverter](f: (A0, A1, A2, A3, A4) => B) = {
    implicit val ft = new FunctionBasedRowReader5(f)
    convertTo[B]
  }

  def as[B: ClassTag, A0: TypeConverter, A1: TypeConverter, A2: TypeConverter, A3: TypeConverter,
  A4: TypeConverter, A5: TypeConverter](f: (A0, A1, A2, A3, A4, A5) => B) = {
    implicit val ft = new FunctionBasedRowReader6(f)
    convertTo[B]
  }

  def as[B: ClassTag, A0: TypeConverter, A1: TypeConverter, A2: TypeConverter, A3: TypeConverter,
  A4: TypeConverter, A5: TypeConverter, A6: TypeConverter](f: (A0, A1, A2, A3, A4, A5, A6) => B) = {
    implicit val ft = new FunctionBasedRowReader7(f)
    convertTo[B]
  }

  def as[B: ClassTag, A0: TypeConverter, A1: TypeConverter, A2: TypeConverter, A3: TypeConverter,
  A4: TypeConverter, A5: TypeConverter, A6: TypeConverter,
  A7: TypeConverter](f: (A0, A1, A2, A3, A4, A5, A6, A7) => B) = {
    implicit val ft = new FunctionBasedRowReader8(f)
    convertTo[B]
  }

  def as[B: ClassTag, A0: TypeConverter, A1: TypeConverter, A2: TypeConverter, A3: TypeConverter,
  A4: TypeConverter, A5: TypeConverter, A6: TypeConverter, A7: TypeConverter,
  A8: TypeConverter](f: (A0, A1, A2, A3, A4, A5, A6, A7, A8) => B) = {
    implicit val ft = new FunctionBasedRowReader9(f)
    convertTo[B]
  }

  def as[B: ClassTag, A0: TypeConverter, A1: TypeConverter, A2: TypeConverter, A3: TypeConverter,
  A4: TypeConverter, A5: TypeConverter, A6: TypeConverter, A7: TypeConverter,
  A8: TypeConverter, A9: TypeConverter](f: (A0, A1, A2, A3, A4, A5, A6, A7, A8, A9) => B) = {
    implicit val ft = new FunctionBasedRowReader10(f)
    convertTo[B]
  }

  def as[B: ClassTag, A0: TypeConverter, A1: TypeConverter, A2: TypeConverter, A3: TypeConverter,
  A4: TypeConverter, A5: TypeConverter, A6: TypeConverter, A7: TypeConverter, A8: TypeConverter,
  A9: TypeConverter, A10: TypeConverter](f: (A0, A1, A2, A3, A4, A5, A6, A7, A8, A9, A10) => B) = {
    implicit val ft = new FunctionBasedRowReader11(f)
    convertTo[B]
  }

  def as[B: ClassTag, A0: TypeConverter, A1: TypeConverter, A2: TypeConverter, A3: TypeConverter,
         A4: TypeConverter, A5: TypeConverter, A6: TypeConverter, A7: TypeConverter, A8: TypeConverter,
         A9: TypeConverter, A10: TypeConverter, A11: TypeConverter](
    f: (A0, A1, A2, A3, A4, A5, A6, A7, A8, A9, A10, A11) => B) = {

    implicit val ft = new FunctionBasedRowReader12(f)
    convertTo[B]
  }
}

object CassandraRDD {
  def apply[T](sc: SparkContext, keyspaceName: String, tableName: String)
              (implicit ct: ClassTag[T], rrf: RowReaderFactory[T]): CassandraRDD[T] = {

    new CassandraTableScanRDD[T](
      sc,
      CassandraConnector(sc),
      keyspaceName,
      tableName,
      AllColumns,
      CqlWhereClause.empty,
      None,
      None,
      ReadConf.fromSparkConf(sc.getConf)
    )
  }

  def apply[K, V](sc: SparkContext, keyspaceName: String, tableName: String)
                 (implicit keyCT: ClassTag[K], valueCT: ClassTag[V], rrf: RowReaderFactory[(K, V)]): CassandraRDD[(K, V)] =

    new CassandraTableScanRDD[(K, V)](
      sc,
      CassandraConnector(sc),
      keyspaceName,
      tableName,
      AllColumns,
      CqlWhereClause.empty,
      None,
      None,
      ReadConf.fromSparkConf(sc.getConf)
    )
}
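
/* Usage sketch: the single-type-parameter `apply` above builds a full table scan over generic
 * `CassandraRow` objects, relying on the implicit `RowReaderFactory[CassandraRow]` provided by
 * the connector:
 * {{{
 * import com.datastax.spark.connector.CassandraRow
 *
 * val rdd = CassandraRDD[CassandraRow](sc, "ks", "words")
 * }}}
 */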



