com.datastax.spark.connector.rdd.ReadConf.scala Maven / Gradle / Ivy

Go to download
package com.datastax.spark.connector.rdd

import com.datastax.oss.driver.api.core.{ConsistencyLevel, DefaultConsistencyLevel}
import com.datastax.spark.connector.util.{ConfigCheck, ConfigParameter, DeprecatedConfigParameter, Logging}
import com.datastax.spark.connector.writer.WriteConf.{ReferenceSection, ThroughputMiBPSParam}
import org.apache.spark.SparkConf

/** Read settings for RDD
  *
  * @param splitCount number of partitions to divide the data into; unset by default
  * @param splitSizeInMB size of Cassandra data to be read in a single Spark task; 
  *                      determines the number of partitions, but ignored if `splitCount` is set
  * @param fetchSizeInRows number of CQL rows to fetch in a single round-trip to Cassandra
  * @param consistencyLevel consistency level for reads, default LOCAL_ONE;
  *                         higher consistency level will disable data-locality
  * @param taskMetricsEnabled whether or not enable task metrics updates (requires Spark 1.2+)
  * @param readsPerSec maximum read throughput allowed per single core in requests/s while
  *                                  joining an RDD with C* table (joinWithCassandraTable operation)
  *                                  also used by enterprise integrations*/
case class ReadConf(
  splitCount: Option[Int] = None,
  splitSizeInMB: Int = ReadConf.SplitSizeInMBParam.default,
  fetchSizeInRows: Int = ReadConf.FetchSizeInRowsParam.default,
  consistencyLevel: ConsistencyLevel = ReadConf.ConsistencyLevelParam.default,
  taskMetricsEnabled: Boolean = ReadConf.TaskMetricParam.default,
  throughputMiBPS: Option[Double] = None,
  readsPerSec: Option[Int] = ReadConf.ReadsPerSecParam.default,
  parallelismLevel: Int = ReadConf.ParallelismLevelParam.default,
  executeAs: Option[String] = None)


object ReadConf extends Logging {
  val ReferenceSection = "Read Tuning Parameters"

  val ThroughputMiBPSParam = ConfigParameter[Option[Double]] (
    name = "spark.cassandra.input.throughputMBPerSec",
    section = ReferenceSection,
    default = None,
    description = """*(Floating points allowed)* 
 Maximum read throughput allowed
                    | per single core in MB/s. Effects point lookups as well as full
                    | scans.""".stripMargin)

  val SplitSizeInMBParam = ConfigParameter[Int](
    name = "spark.cassandra.input.split.sizeInMB",
    section = ReferenceSection,
    default = 512,
    description =
      """Approx amount of data to be fetched into a Spark partition. Minimum number of resulting Spark
        | partitions is 1 + 2 * SparkContext.defaultParallelism
        |""".stripMargin.filter(_ >= ' '))

  val DeprecatedSplitSizeInMBParam = DeprecatedConfigParameter(
    name = "spark.cassandra.input.split.size_in_mb",
    replacementParameter = Some(SplitSizeInMBParam),
    deprecatedSince = "DSE 6.0.0"
  )

  val FetchSizeInRowsParam = ConfigParameter[Int](
    name = "spark.cassandra.input.fetch.sizeInRows",
    section = ReferenceSection,
    default = 1000,
    description = """Number of CQL rows fetched per driver request""")

  val DeprecatedFetchSizeInRowsParam = DeprecatedConfigParameter(
    name = "spark.cassandra.input.fetch.size_in_rows",
    replacementParameter = Some(FetchSizeInRowsParam),
    deprecatedSince = "DSE 6.0.0"
  )

  val ConsistencyLevelParam = ConfigParameter[ConsistencyLevel](
    name = "spark.cassandra.input.consistency.level",
    section = ReferenceSection,
    default = DefaultConsistencyLevel.LOCAL_ONE,
    description = """Consistency level to use when reading	""")

  val TaskMetricParam = ConfigParameter[Boolean](
    name = "spark.cassandra.input.metrics",
    section = ReferenceSection,
    default = true,
    description = """Sets whether to record connector specific metrics on write"""
  )

  val ReadsPerSecParam = ConfigParameter[Option[Int]] (
    name = "spark.cassandra.input.readsPerSec",
    section = ReferenceSection,
    default = None,
    description =
      """Sets max requests or pages per core per second, unlimited by default."""
  )

  val DeprecatedReadsPerSecParam = DeprecatedConfigParameter(
    name = "spark.cassandra.input.reads_per_sec",
    replacementParameter = Some(ReadsPerSecParam),
    deprecatedSince = "DSE 6.0.0"
  )

  val ThroughputJoinQueryPerSecParam = DeprecatedConfigParameter (
    name = "spark.cassandra.input.join.throughput_query_per_sec",
    replacementParameter = Some(ReadsPerSecParam),
    deprecatedSince = "DSE 5.1.5")


  val ParallelismLevelParam = ConfigParameter[Int] (
    name = "spark.cassandra.concurrent.reads",
    section = ReferenceSection,
    default = 512,
    description =
      """Sets read parallelism for joinWithCassandra tables"""
  )

  def fromSparkConf(conf: SparkConf): ReadConf = {

    ConfigCheck.checkConfig(conf)

    ReadConf(
      fetchSizeInRows = conf.getInt(FetchSizeInRowsParam.name, FetchSizeInRowsParam.default),
      splitSizeInMB = conf.getInt(SplitSizeInMBParam.name, SplitSizeInMBParam.default),
      consistencyLevel = DefaultConsistencyLevel.valueOf(
        conf.get(ConsistencyLevelParam.name, ConsistencyLevelParam.default.name)),
      taskMetricsEnabled = conf.getBoolean(TaskMetricParam.name, TaskMetricParam.default),
      throughputMiBPS = conf.getOption(ThroughputMiBPSParam.name).map(_.toDouble),
      readsPerSec = conf.getOption(ReadsPerSecParam.name).map(_.toInt),
      parallelismLevel = conf.getInt(ParallelismLevelParam.name, ParallelismLevelParam.default)
    )
  }

}