package org.apache.spark.sql.cassandra

import com.datastax.spark.connector.cql.TableDef
import com.datastax.spark.connector.datasource.{CassandraScan, CassandraTable}
import com.datastax.spark.connector.util._
import org.apache.hadoop.hive.conf.HiveConf
import org.apache.hadoop.security.UserGroupInformation
import org.apache.spark.SparkConf
import org.apache.spark.sql._
import org.apache.spark.sql.execution.datasources.v2.{DataSourceV2Relation, DataSourceV2ScanRelation}
import org.apache.spark.sql.sources._
import org.apache.spark.sql.util.CaseInsensitiveStringMap

import scala.jdk.CollectionConverters._
sealed trait DirectJoinSetting
case object AlwaysOn extends DirectJoinSetting
case object AlwaysOff extends DirectJoinSetting
case object Automatic extends DirectJoinSetting

sealed trait DseSearchOptimizationSetting { def enabled = false }

case object On extends DseSearchOptimizationSetting {
  override def enabled = true
  override def toString: String = "on"
}

case object Off extends DseSearchOptimizationSetting {
  override def toString: String = "off"
}

case class Auto(ratio: Double) extends DseSearchOptimizationSetting {
  override def enabled = true
  override def toString: String = "auto"
}

trait CassandraTableDefProvider {
  def tableDef: TableDef
  def withSparkConfOption(key: String, value: String): BaseRelation
}

object CassandraSourceRelation extends Logging {
  private lazy val hiveConf = new HiveConf()

  val ReferenceSection = "Cassandra Datasource Parameters"
  val TableOptions = "Cassandra Datasource Table Options"

  val TableSizeInBytesParam = ConfigParameter[Option[Long]](
    name = "spark.cassandra.table.size.in.bytes",
    section = ReferenceSection,
    default = None,
    description =
      """Used internally by DataFrames; a future release will retrieve the size
        |from Cassandra. Can be set manually for now.""".stripMargin
  )
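
  // A minimal usage sketch (keyspace/table names "ks"/"tab" are hypothetical),
  // assuming the size estimate may be supplied as a read option; it can also be
  // set on the SparkConf:
  //
  //   val df = spark.read
  //     .format("org.apache.spark.sql.cassandra")
  //     .options(Map(
  //       "keyspace" -> "ks",
  //       "table" -> "tab",
  //       "spark.cassandra.table.size.in.bytes" -> "1000000000"))
  //     .load()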

  val WriteTimeParam = ConfigParameter[Option[String]](
    name = "writetime",
    section = TableOptions,
    default = None,
    description =
      """Surfaces the Cassandra row writetime as a column
        |with the name specified. When reading, use writetime.columnName=aliasForWritetime; this
        |can be done for every column with a writetime. When writing, use writetime=columnName and
        |that column will be used to set the writetime for the row.""".stripMargin
  )
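
  // Hypothetical read/write sketches for the writetime option ("ks", "tab" and
  // the column names are made up):
  //
  //   // Read: surface the writetime of column "value" under an alias.
  //   val df = spark.read
  //     .format("org.apache.spark.sql.cassandra")
  //     .options(Map("keyspace" -> "ks", "table" -> "tab", "writetime.value" -> "value_writetime"))
  //     .load()
  //
  //   // Write: use the "ts_micros" column of the DataFrame as the row writetime.
  //   df.write
  //     .format("org.apache.spark.sql.cassandra")
  //     .options(Map("keyspace" -> "ks", "table" -> "tab", "writetime" -> "ts_micros"))
  //     .mode("append")
  //     .save()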

  val TTLParam = ConfigParameter[Option[String]](
    name = "ttl",
    section = TableOptions,
    default = None,
    description =
      """Surfaces the Cassandra row TTL as a Column
        |with the name specified. When reading, use ttl.columnName=aliasForTTL; this
        |can be done for every column with a TTL. When writing, use ttl=columnName and
        |that column will be used to set the TTL for the row.""".stripMargin
  )
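
  // Analogous hypothetical sketch for TTL (names are made up): surface the TTL
  // of column "value" under the alias "value_ttl" when reading.
  //
  //   val df = spark.read
  //     .format("org.apache.spark.sql.cassandra")
  //     .options(Map("keyspace" -> "ks", "table" -> "tab", "ttl.value" -> "value_ttl"))
  //     .load()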

  val AdditionalCassandraPushDownRulesParam = ConfigParameter[List[CassandraPredicateRules]](
    name = "spark.cassandra.sql.pushdown.additionalClasses",
    section = ReferenceSection,
    default = List.empty,
    description =
      """A comma-separated list of classes to be used (in order) to apply additional
        |pushdown rules for Cassandra DataFrames. Classes must implement CassandraPredicateRules.
      """.stripMargin
  )
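
  // A sketch of wiring in extra pushdown rules; com.example.MyPredicateRules is a
  // hypothetical class that would need to implement CassandraPredicateRules.
  //
  //   spark.conf.set(
  //     "spark.cassandra.sql.pushdown.additionalClasses",
  //     "com.example.MyPredicateRules")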

  val SearchPredicateOptimizationRatioParam = ConfigParameter[Double](
    name = "spark.sql.dse.search.autoRatio",
    section = ReferenceSection,
    default = 0.03,
    description = "When Search Predicate Optimization is set to auto, Search optimizations will be " +
      "performed if this parameter * the total number of rows is greater than the number of rows " +
      "to be returned by the Solr query"
  )

  val SearchPredicateOptimizationParam = ConfigParameter[String](
    name = "spark.sql.dse.search.enableOptimization",
    section = ReferenceSection,
    default = "auto",
    description =
      s"""Enables SparkSQL to automatically replace Cassandra pushdowns with DSE Search
         |pushdowns utilizing Lucene indexes. Valid options are On, Off, and Auto. Auto enables
         |optimizations when the Solr query will pull less than ${SearchPredicateOptimizationRatioParam.name} * the
         |total table record count""".stripMargin
  )
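
  // Hypothetical sketch: forcing the search optimization on for a session instead
  // of relying on the "auto" ratio heuristic.
  //
  //   spark.conf.set("spark.sql.dse.search.enableOptimization", "on")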

  val SolrPredicateOptimizationParam = DeprecatedConfigParameter(
    name = "spark.sql.dse.solr.enable_optimization",
    replacementParameter = Some(SearchPredicateOptimizationParam),
    deprecatedSince = "DSE 6.0.0"
  )

  val DirectJoinSizeRatioParam = ConfigParameter[Double](
    name = "directJoinSizeRatio",
    section = TableOptions,
    default = 0.9d,
    description =
      s"""
        | Sets the threshold for performing a Direct Join in place of a full table scan. When
        | (the size of the Cassandra table * this parameter) > the size of the other side of the
        | join, a direct join will be performed if possible.
      """.stripMargin
  )

  val DirectJoinSettingParam = ConfigParameter[String](
    name = "directJoinSetting",
    section = TableOptions,
    default = "auto",
    description =
      s"""Acceptable values: "on", "off", "auto"
        |"on" causes a direct join to happen if possible, regardless of the size ratio.
        |"off" disables direct joins even when possible.
        |"auto" only performs a direct join when the size ratio is satisfied; see ${DirectJoinSizeRatioParam.name}.
      """.stripMargin
  )
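
  // Worked example of how the two direct-join options interact: with the default
  // ratio of 0.9, a 1 GB Cassandra table direct-joins against an 850 MB relation
  // (1 GB * 0.9 = 900 MB > 850 MB) but not against a 950 MB one. Forcing it on
  // per read is a hypothetical sketch ("ks"/"tab" are made-up names):
  //
  //   val df = spark.read
  //     .format("org.apache.spark.sql.cassandra")
  //     .options(Map("keyspace" -> "ks", "table" -> "tab", "directJoinSetting" -> "on"))
  //     .load()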

  val InClauseToJoinWithTableConversionThreshold = ConfigParameter[Long](
    name = "spark.cassandra.sql.inClauseToJoinConversionThreshold",
    section = ReferenceSection,
    default = 2500L,
    description =
      s"""Queries with `IN` clause(s) are converted to a JoinWithCassandraTable operation if the size of the cross
        |product of all `IN` value sets exceeds this value. To disable `IN` clause conversion, set this setting to 0.
        |The query `select * from t where k1 in (1,2,3) and k2 in (1,2) and k3 in (1,2,3,4)` has 3 sets of `IN` values.
        |The cross product of these values has a size of 24.
      """.stripMargin
  )
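
  // Worked example of the cross-product arithmetic from the description above:
  // k1 has 3 values, k2 has 2 and k3 has 4, so the cross product is 3 * 2 * 4 = 24.
  // Under the default threshold of 2500 the query is NOT converted; lowering the
  // threshold below 24 (but above 0) would trigger the conversion.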

  val DseInClauseToJoinWithTableConversionThreshold = DeprecatedConfigParameter[Long](
    name = "spark.sql.dse.inClauseToJoinConversionThreshold",
    replacementParameter = Some(InClauseToJoinWithTableConversionThreshold),
    deprecatedSince = "3.0.0",
    rational = "Renamed since this is no longer DSE-specific"
  )

  val InClauseToFullTableScanConversionThreshold = ConfigParameter[Long](
    name = "spark.cassandra.sql.inClauseToFullScanConversionThreshold",
    section = ReferenceSection,
    default = 20000000L,
    description =
      s"""Queries with `IN` clause(s) are not converted to a JoinWithCassandraTable operation if the size of the cross
        |product of all `IN` value sets exceeds this value. It is meant to stop conversion for huge `IN` value sets
        |that may cause memory problems. If this limit is exceeded, a full table scan is performed.
        |This setting takes precedence over ${InClauseToJoinWithTableConversionThreshold.name}.
        |The query `select * from t where k1 in (1,2,3) and k2 in (1,2) and k3 in (1,2,3,4)` has 3 sets of `IN` values.
        |The cross product of these values has a size of 24.
      """.stripMargin
  )

  val DseInClauseToFullTableScanConversionThreshold = DeprecatedConfigParameter[Long](
    name = "spark.sql.dse.inClauseToFullScanConversionThreshold",
    replacementParameter = Some(InClauseToFullTableScanConversionThreshold),
    deprecatedSince = "3.0.0",
    rational = "Renamed because this is no longer DSE-specific"
  )

  val IgnoreMissingMetaColumns = ConfigParameter[Boolean](
    name = "ignoreMissingMetaColumns",
    section = TableOptions,
    default = false,
    description =
      s"""Acceptable values: "true", "false"
        |"true" ignores missing meta properties.
        |"false" throws an error if a missing meta property is requested.
      """.stripMargin
  )
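
  // Hypothetical sketch: tolerate requesting writetime/TTL meta columns on tables
  // where not every column can provide them.
  //
  //   val df = spark.read
  //     .format("org.apache.spark.sql.cassandra")
  //     .options(Map("keyspace" -> "ks", "table" -> "tab", "ignoreMissingMetaColumns" -> "true"))
  //     .load()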

  private val proxyPerSourceRelationEnabled = sys.env.getOrElse("DSE_ENABLE_PROXY_PER_SRC_RELATION", "false").toBoolean

  /** Returns the HiveServer2 proxy user, but only when both impersonation (doAs)
    * and authentication are enabled in the Hive configuration. */
  private def getProxyUser(sqlContext: SQLContext): Option[String] = {
    val doAsEnabled = hiveConf.getBoolVar(HiveConf.ConfVars.HIVE_SERVER2_ENABLE_DOAS)
    val authenticationEnabled = hiveConf.getVar(HiveConf.ConfVars.HIVE_SERVER2_AUTHENTICATION).toUpperCase match {
      case "NONE" => false
      case _ => true
    }
    lazy val user = UserGroupInformation.getCurrentUser

    if (doAsEnabled && authenticationEnabled) {
      Some(user.getUserName)
    } else {
      None
    }
  }

  /** Rewrites the logical plan of the given Dataset so that any Cassandra
    * relations or scans it contains use the supplied [[DirectJoinSetting]]. */
  def setDirectJoin[K: Encoder](ds: Dataset[K], directJoinSetting: DirectJoinSetting = AlwaysOn): Dataset[K] = {
    val oldPlan = ds.queryExecution.logical
    Dataset[K](ds.sparkSession,
      oldPlan.transform {
        case ds @ DataSourceV2Relation(_: CassandraTable, _, _, _, options) =>
          ds.copy(options = applyDirectJoinSetting(options, directJoinSetting))
        case ds @ DataSourceV2ScanRelation(_: CassandraTable, scan: CassandraScan, _, _, _) =>
          ds.copy(scan = scan.copy(consolidatedConf = applyDirectJoinSetting(scan.consolidatedConf, directJoinSetting)))
      }
    )
  }
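
  // Hypothetical usage sketch, assuming `ds` is a Dataset backed by a Cassandra
  // table; the rewritten Dataset will always attempt a direct join:
  //
  //   val forced = CassandraSourceRelation.setDirectJoin(ds, AlwaysOn)
  //   forced.join(otherDs, forced("k") === otherDs("k"))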

  val defaultClusterName = "default"

  def directJoinSettingToString(directJoinSetting: DirectJoinSetting) = directJoinSetting match {
    case AlwaysOn => "on"
    case AlwaysOff => "off"
    case Automatic => "auto"
  }

  def applyDirectJoinSetting(options: CaseInsensitiveStringMap, directJoinSetting: DirectJoinSetting): CaseInsensitiveStringMap = {
    val value = directJoinSettingToString(directJoinSetting)
    new CaseInsensitiveStringMap((Map(DirectJoinSettingParam.name -> value) ++ options.asScala).asJava)
  }

  def applyDirectJoinSetting(sparkConf: SparkConf, directJoinSetting: DirectJoinSetting): SparkConf = {
    val value = directJoinSettingToString(directJoinSetting)
    sparkConf.set(DirectJoinSettingParam.name, value)
  }

  def getDirectJoinSetting(conf: SparkConf) = {
    conf
      .get(DirectJoinSettingParam.name, DirectJoinSettingParam.default)
      .toLowerCase() match {
        case "auto" => Automatic
        case "on" => AlwaysOn
        case "off" => AlwaysOff
        case invalid => throw new IllegalArgumentException(
          s"""
             |$invalid is not a valid ${DirectJoinSettingParam.name} value.
             |${DirectJoinSettingParam.description}""".stripMargin)
      }
  }
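
  // Hypothetical sketch: reading the configured setting back from a SparkConf;
  // an invalid value raises IllegalArgumentException.
  //
  //   val setting = CassandraSourceRelation.getDirectJoinSetting(spark.sparkContext.getConf)
  //   // setting: DirectJoinSetting (Automatic unless overridden)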
}

object SolrConstants {
  val DseSolrIndexClassName = "com.datastax.bdp.search.solr.Cql3SolrSecondaryIndex"
  val SolrQuery = "solr_query"
}