package com.datastax.spark.connector.writer
import com.datastax.oss.driver.api.core.{ConsistencyLevel, DefaultConsistencyLevel}
import com.datastax.oss.driver.api.core.`type`.{DataType, DataTypes}
import com.datastax.spark.connector.cql.{ColumnDef, RegularColumn}
import com.datastax.spark.connector.types.ColumnType
import com.datastax.spark.connector.util.ConfigCheck.ConnectorConfigurationException
import com.datastax.spark.connector.util.{ConfigCheck, ConfigParameter, DeprecatedConfigParameter}
import com.datastax.spark.connector.{BatchSize, BytesInBatch, RowsInBatch}
import org.apache.spark.SparkConf
/** Write settings for RDD
*
* @param batchSize approximate number of bytes to be written in a single batch, or
* the exact number of rows to be written in a single batch
* @param batchGroupingBufferSize the number of distinct batches that can be buffered before
* they are written to Cassandra
* @param batchGroupingKey which rows can be grouped into a single batch
* @param consistencyLevel consistency level for writes, default LOCAL_QUORUM
* @param ifNotExists insert a row only if a row with the same primary key does not already exist
* @param ignoreNulls whether null values in bound statements are left as unset rather than bound
* @param parallelismLevel number of batches to be written in parallel
* @param throughputMiBPS maximum write throughput per single core in MB/s, if throttling is desired
* @param ttl the default TTL value, used when it is defined (in seconds)
* @param timestamp the default timestamp value, used when it is defined (in microseconds)
* @param taskMetricsEnabled whether or not to enable task metrics updates (requires Spark 1.2+)
* @param executeAs the user to proxy-execute write requests as, if any
*/
case class WriteConf(
batchSize: BatchSize = BatchSize.Automatic,
batchGroupingBufferSize: Int = WriteConf.BatchBufferSizeParam.default,
batchGroupingKey: BatchGroupingKey = WriteConf.BatchLevelParam.default,
consistencyLevel: ConsistencyLevel = WriteConf.ConsistencyLevelParam.default,
ifNotExists: Boolean = WriteConf.IfNotExistsParam.default,
ignoreNulls: Boolean = WriteConf.IgnoreNullsParam.default,
parallelismLevel: Int = WriteConf.ParallelismLevelParam.default,
throughputMiBPS: Option[Double] = WriteConf.ThroughputMiBPSParam.default,
ttl: TTLOption = TTLOption.defaultValue,
timestamp: TimestampOption = TimestampOption.defaultValue,
taskMetricsEnabled: Boolean = WriteConf.TaskMetricsParam.default,
executeAs: Option[String] = None) {
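// Placeholder names used in the generated INSERT when TTL and/or timestamp
// are supplied per row rather than as constants.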
private[writer] val optionPlaceholders: Seq[String] = Seq(ttl, timestamp).collect {
case WriteOption(PerRowWriteOptionValue(placeholder)) => placeholder
}
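// For per-row TTL/timestamp options, synthesizes extra regular columns
// (INT for TTL, BIGINT for timestamp) named after their placeholders, so the
// row writer can bind the per-row values like ordinary columns.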
private[writer] val optionsAsColumns: (String, String) => Seq[ColumnDef] = { (keyspace, table) =>
def toRegularColDef(opt: WriteOption[_], dataType: DataType) = opt match {
case WriteOption(PerRowWriteOptionValue(placeholder)) =>
Some(ColumnDef(placeholder, RegularColumn, ColumnType.fromDriverType(dataType)))
case _ => None
}
Seq(toRegularColDef(ttl, DataTypes.INT), toRegularColDef(timestamp, DataTypes.BIGINT)).flatten
}
val throttlingEnabled = throughputMiBPS.isDefined
}
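/* A minimal usage sketch (not part of the original source; the values are
 * hypothetical): constructing a WriteConf directly instead of via fromSparkConf.
 * TTLOption.perRow makes the connector synthesize an extra bound column
 * (see optionsAsColumns above) so each row can carry its own TTL.
 * {{{
 * val fixedConf = WriteConf(
 *   batchSize = RowsInBatch(100),
 *   consistencyLevel = DefaultConsistencyLevel.LOCAL_ONE,
 *   ttl = TTLOption.constant(3600)) // 1-hour TTL for every row
 *
 * val perRowConf = WriteConf(ttl = TTLOption.perRow("row_ttl"))
 * // perRowConf.optionPlaceholders == Seq("row_ttl")
 * }}}
 */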
object WriteConf {
val ReferenceSection = "Write Tuning Parameters"
val ConsistencyLevelParam = ConfigParameter[ConsistencyLevel](
name = "spark.cassandra.output.consistency.level",
section = ReferenceSection,
default = DefaultConsistencyLevel.LOCAL_QUORUM,
description = """Consistency level for writing""")
val BatchSizeRowsParam = ConfigParameter[Option[Int]](
name = "spark.cassandra.output.batch.size.rows",
section = ReferenceSection,
default = None,
description = """Number of rows per single batch. The default is 'auto'
|which means the connector will adjust the number
|of rows based on the amount of data
|in each row""".stripMargin)
val BatchSizeBytesParam = ConfigParameter[Int](
name = "spark.cassandra.output.batch.size.bytes",
section = ReferenceSection,
default = 1024,
description = s"""Maximum total size of the batch in bytes. Overridden by
|${BatchSizeRowsParam.name}
""".stripMargin)
val BatchBufferSizeParam = ConfigParameter[Int](
name = "spark.cassandra.output.batch.grouping.buffer.size",
section = ReferenceSection,
default = 1000,
description = """ How many batches per single Spark task can be stored in
|memory before sending to Cassandra""".stripMargin)
val BatchLevelParam = ConfigParameter[BatchGroupingKey](
name = "spark.cassandra.output.batch.grouping.key",
section = ReferenceSection,
default = BatchGroupingKey.Partition,
description = """Determines how insert statements are grouped into batches. Available values are
|
| -
none
: a batch may contain any statements
| -
replica_set
: a batch may contain only statements to be written to the same replica set
| -
partition
: a batch may contain only statements for rows sharing the same partition key value
|
|""".stripMargin)
val IfNotExistsParam = ConfigParameter[Boolean](
name = "spark.cassandra.output.ifNotExists",
section = ReferenceSection,
default = false,
description =
"""Determines that the INSERT operation is not performed if a row with the same primary
|key already exists. Using the feature incurs a performance hit.""".stripMargin)
val IgnoreNullsParam = ConfigParameter[Boolean](
name = "spark.cassandra.output.ignoreNulls",
section = ReferenceSection,
default = false,
description =
""" In Cassandra >= 2.2 null values can be left as unset in bound statements. Setting
|this to true will cause all null values to be left as unset rather than bound. For
|finer control see the CassandraOption class""".stripMargin)
val ParallelismLevelParam = ConfigParameter[Int] (
name = "spark.cassandra.output.concurrent.writes",
section = ReferenceSection,
default = 5,
description = """Maximum number of batches executed in parallel by a
| single Spark task""".stripMargin)
val ThroughputMiBPSParam = ConfigParameter[Option[Double]] (
name = "spark.cassandra.output.throughputMBPerSec",
section = ReferenceSection,
default = None,
description = """*(Floating points allowed)*
Maximum write throughput allowed
| per single core in MB/s.
Limit this on long (+8 hour) runs to 70% of your max throughput
| as seen on a smaller job for stability""".stripMargin)
val DeprecatedThroughputMiBPSParam = DeprecatedConfigParameter(
name = "spark.cassandra.output.throughput_mb_per_sec",
replacementParameter = Some(ThroughputMiBPSParam),
deprecatedSince = "DSE 6.0.0"
)
val TTLParam = ConfigParameter[Int] (
name = "spark.cassandra.output.ttl",
section = ReferenceSection,
default = 0,
description = """Time To Live(TTL) assigned to writes to Cassandra. A value of 0 means no TTL""".stripMargin)
val TimestampParam = ConfigParameter[Long](
name = "spark.cassandra.output.timestamp",
section = ReferenceSection,
default = 0,
description =
"""Timestamp (microseconds since epoch) of the write. A value of 0 (the default)
|means the time at which the write occurs is used.""".stripMargin)
/** Task Metrics **/
val TaskMetricsParam = ConfigParameter[Boolean](
name = "spark.cassandra.output.metrics",
section = ReferenceSection,
default = true,
description = """Sets whether to record connector specific metrics on write"""
)
def fromSparkConf(conf: SparkConf): WriteConf = {
ConfigCheck.checkConfig(conf)
val batchSizeInBytes = conf.getInt(BatchSizeBytesParam.name, BatchSizeBytesParam.default)
val consistencyLevel = DefaultConsistencyLevel.valueOf(
conf.get(ConsistencyLevelParam.name, ConsistencyLevelParam.default.name()))
val batchSizeInRowsStr = conf.get(BatchSizeRowsParam.name, "auto")
val ifNotExists = conf.getBoolean(IfNotExistsParam.name, IfNotExistsParam.default)
val ignoreNulls = conf.getBoolean(IgnoreNullsParam.name, IgnoreNullsParam.default)
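// 'auto' (the default) selects byte-based batching using the size above;
// a plain number selects a fixed row count per batch.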
val batchSize = {
val Number = "([0-9]+)".r
batchSizeInRowsStr match {
case "auto" => BytesInBatch(batchSizeInBytes)
case Number(x) => RowsInBatch(x.toInt)
case other =>
throw new ConnectorConfigurationException(
s"Invalid value of spark.cassandra.output.batch.size.rows: $other. Number or 'auto' expected")
}
}
val batchBufferSize = conf.getInt(BatchBufferSizeParam.name, BatchBufferSizeParam.default)
val batchGroupingKey = conf.getOption(BatchLevelParam.name)
.map(BatchGroupingKey.apply)
.getOrElse(BatchLevelParam.default)
val parallelismLevel = conf.getInt(ParallelismLevelParam.name, ParallelismLevelParam.default)
val throughputMiBPS = conf.getOption(ThroughputMiBPSParam.name).map(_.toDouble)
val metricsEnabled = conf.getBoolean(TaskMetricsParam.name, TaskMetricsParam.default)
val ttlSeconds = conf.getInt(TTLParam.name, TTLParam.default)
val ttlOption =
if (ttlSeconds == TTLParam.default)
TTLOption.defaultValue
else
TTLOption.constant(ttlSeconds)
val timestampMicros = conf.getLong(TimestampParam.name, TimestampParam.default)
val timestampOption =
if (timestampMicros == TimestampParam.default)
TimestampOption.defaultValue
else
TimestampOption.constant(timestampMicros)
WriteConf(
batchSize = batchSize,
batchGroupingBufferSize = batchBufferSize,
batchGroupingKey = batchGroupingKey,
consistencyLevel = consistencyLevel,
parallelismLevel = parallelismLevel,
throughputMiBPS = throughputMiBPS,
taskMetricsEnabled = metricsEnabled,
ttl = ttlOption,
timestamp = timestampOption,
ignoreNulls = ignoreNulls,
ifNotExists = ifNotExists)
}
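/* A short usage sketch (not part of the original source): deriving a WriteConf from
 * Spark properties; the property names match the ConfigParameter definitions above.
 * {{{
 * val sparkConf = new SparkConf()
 *   .set("spark.cassandra.output.batch.size.bytes", "4096")
 *   .set("spark.cassandra.output.consistency.level", "LOCAL_ONE")
 *   .set("spark.cassandra.output.throughputMBPerSec", "5.5")
 * val writeConf = WriteConf.fromSparkConf(sparkConf)
 * // writeConf.throttlingEnabled == true
 * }}}
 */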
}