All Downloads are FREE. Search and download functionalities are using the official Maven repository.

com.datastax.spark.connector.rdd.SpannedRDD.scala Maven / Gradle / Ivy

package com.datastax.spark.connector.rdd

import org.apache.spark.{TaskContext, Partition}
import org.apache.spark.annotation.DeveloperApi
import org.apache.spark.rdd.RDD

import com.datastax.spark.connector.util.SpanningIterator

/**
 * Groups items with the same key, assuming that items with the same key are next to each other
 * in the parent collection. Contrary to Spark GroupedRDD, it does not perform a shuffle,
 * therefore it is much faster. The key of each item is obtained by calling the given
 * function `f`.
 *
 * This RDD is very useful for grouping data coming out of Cassandra, because the rows already
 * arrive in order of partitioning key, i.e. it is not possible for two rows with the same
 * partition key to be in different Spark partitions.
 *
 * The partitions of this RDD are the parent's partitions; each one is computed by handing the
 * parent's iterator for that partition, together with `f`, to a [[SpanningIterator]].
 *
 * @param parent parent RDD
 * @param f function called on each element to obtain its grouping key
 * @tparam K type of keys
 * @tparam T type of elements to be grouped together
 */
private[connector] class SpannedRDD[K, T](parent: RDD[T], f: T => K)
  extends RDD[(K, Iterable[T])](parent) {

  /** Returns the parent's partitions as they are. */
  override protected def getPartitions: Array[Partition] = parent.partitions

  /**
   * Computes a single partition by wrapping the parent's iterator for `split` in a
   * [[SpanningIterator]] driven by the key function `f`.
   *
   * The result type is declared explicitly so that the signature matches the `RDD` contract
   * rather than leaking the concrete iterator implementation.
   *
   * @param split partition to compute
   * @param context task context passed through to the parent's iterator
   * @return iterator of (key, group of elements) pairs for the given partition
   */
  @DeveloperApi
  override def compute(split: Partition, context: TaskContext): Iterator[(K, Iterable[T])] =
    new SpanningIterator(parent.iterator(split, context), f)

}





© 2015 - 2024 Weber Informatics LLC | Privacy Policy