/**
* Copyright DataStax, Inc.
*
* Please see the included license file for details.
*/
package org.apache.spark.sql.cassandra
import com.datastax.spark.connector.cql.{IndexDef, TableDef}
import com.datastax.spark.connector.types._
import com.datastax.spark.connector.util.Logging
import org.apache.spark.SparkConf
import org.apache.spark.sql.cassandra.PredicateOps.FilterOps
import org.apache.spark.sql.sources.{EqualTo, Filter, IsNotNull}
import scala.collection.compat._
/**
* A series of pushdown rules that apply only when connecting to DataStax Enterprise.
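*
* A hypothetical usage sketch, assuming the connector's
* `spark.cassandra.sql.pushdown.additionalClasses` option (which registers additional
* [[CassandraPredicateRules]] implementations by fully qualified class name) and an
* illustrative keyspace `ks` and table `tab`:
* {{{
* spark.read
*   .format("org.apache.spark.sql.cassandra")
*   .options(Map(
*     "keyspace" -> "ks",
*     "table" -> "tab",
*     "spark.cassandra.sql.pushdown.additionalClasses" ->
*       "org.apache.spark.sql.cassandra.DsePredicateRules"))
*   .load()
* }}}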
*/
object DsePredicateRules extends CassandraPredicateRules with Logging {
val StorageAttachedIndex = "StorageAttachedIndex"
private type PredicateRule = (AnalyzedPredicates, TableDef) => AnalyzedPredicates
private val saiTextTypes = Set[ColumnType[_]](TextType, AsciiType, VarCharType, UUIDType)
private val saiNumericTypes = Set[ColumnType[_]](IntType, BigIntType, SmallIntType, TinyIntType, FloatType, DoubleType,
VarIntType, DecimalType, TimestampType, DateType, InetType, TimeType, TimeUUIDType)
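/**
* Applies the DSE rules in sequence: `pushOnlySolrQuery` first, then the SAI rules
* returned by `storageAttachedIndexPredicateRules` (an empty sequence when the table
* has no SAI indexes or when the query has an IN clause on a partition key column).
*/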
override def apply(
predicates: AnalyzedPredicates,
tableDef: TableDef,
sparkConf: SparkConf): AnalyzedPredicates = {
val pushDownRules = Seq[PredicateRule](
pushOnlySolrQuery
) ++ storageAttachedIndexPredicateRules(predicates, tableDef)
pushDownRules.foldLeft(predicates) { (p, rule) => rule(p, tableDef) }
}
/**
* When the query contains a "solr_query = ..." clause, every other predicate must be
* removed from the set pushed down to Cassandra.
*
* Example (SparkSQL): "SELECT * FROM ks.tab WHERE solr_query = 'data:pikachu' AND pkey = 'bob'"
*
* Here only the solr_query predicate can be handled in CQL; pushing pkey down as well
* would cause an exception.
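*
* A sketch of the resulting predicate split (the filter sets are illustrative):
* {{{
* // input:  handledByCassandra = Set(EqualTo("solr_query", "data:pikachu"), EqualTo("pkey", "bob"))
* // output: handledByCassandra = Set(EqualTo("solr_query", "data:pikachu"))
* //         handledBySpark     = Set(EqualTo("pkey", "bob"))
* }}}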
*/
private def pushOnlySolrQuery(predicates: AnalyzedPredicates, tableDef: TableDef): AnalyzedPredicates = {
val allPredicates = predicates.handledByCassandra ++ predicates.handledBySpark
val solrQuery: Set[Filter] = allPredicates.collect {
case EqualTo(column, value) if column == "solr_query" => EqualTo(column, value)
}
// Spark 2.0+ generates an IsNotNull when we write "= literal"
val solrIsNotNull: Set[Filter] = allPredicates.collect {
case IsNotNull(column) if column == "solr_query" => IsNotNull(column)
}
if (solrQuery.nonEmpty) {
AnalyzedPredicates(solrQuery, allPredicates -- solrQuery -- solrIsNotNull)
} else {
predicates
}
}
// TODO: InClausePredicateRules is applied after DsePredicateRules, so it may remove some of the
// IN clauses from pushdown - we might consider applying the SAI rules after the IN rules.
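/** Returns the table's index definitions whose class name indicates a StorageAttachedIndex. */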
private def saiIndexes(table: TableDef): Seq[IndexDef] = {
table.indexes.filter(_.className.contains(StorageAttachedIndex))
}
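/**
* Moves filters accepted by `eligibleForPushDown` from the Spark side to the Cassandra
* side when the filtered column is covered by an SAI index. A column that already has a
* pushed equality predicate accepts no further pushdowns, which enforces rules 3 and 4
* documented on `storageAttachedIndexPredicateRules`.
*/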
private def pushDownPredicates(predicates: AnalyzedPredicates, table: TableDef)
(eligibleForPushDown: Filter => Boolean): AnalyzedPredicates = {
val indexColumns = saiIndexes(table).map(_.targetColumn)
val pushedEqualityPredicates = predicates.handledByCassandra.collect {
case f if FilterOps.isEqualToPredicate(f) => FilterOps.columnName(f)
}.to(collection.mutable.Set)
val (handledByCassandra, handledBySpark) = predicates.handledBySpark.partition { filter =>
lazy val columnName = FilterOps.columnName(filter)
lazy val eligibleForSAIPushDown = eligibleForPushDown(filter) &&
indexColumns.contains(columnName) &&
!pushedEqualityPredicates.contains(columnName)
val isEqualityPredicate = FilterOps.isEqualToPredicate(filter)
if (isEqualityPredicate && eligibleForSAIPushDown) {
pushedEqualityPredicates += columnName
}
eligibleForSAIPushDown
}
AnalyzedPredicates(predicates.handledByCassandra ++ handledByCassandra, handledBySpark)
}
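/** Pushes equality predicates on SAI-indexed columns whose type is in `saiTextTypes`. */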
private def pushDownSAITextPredicates(predicates: AnalyzedPredicates, table: TableDef): AnalyzedPredicates =
pushDownPredicates(predicates, table) { filter =>
FilterOps.isEqualToPredicate(filter) &&
saiTextTypes.contains(table.columnByName(FilterOps.columnName(filter)).columnType)
}
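/** Pushes equality and range predicates on SAI-indexed columns whose type is in `saiNumericTypes`. */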
private def pushDownSAINumericPredicates(predicates: AnalyzedPredicates, table: TableDef): AnalyzedPredicates =
pushDownPredicates(predicates, table) { filter =>
(FilterOps.isEqualToPredicate(filter) || FilterOps.isRangePredicate(filter)) &&
saiNumericTypes.contains(table.columnByName(FilterOps.columnName(filter)).columnType)
}
/**
* 1. An IN clause on a partition key column prevents SAI pushdown
* 2. An OR clause prevents any pushdown
* 3. At most one equality predicate is pushed per column
* 4. Only the equality predicate is pushed if both equality and range predicates are defined
* 5. Supported predicates:
* - numeric types: >, <, >=, <=, =
* - text types: =
* - set and list types: contains (non-frozen collections), = (frozen collections)
* - map type: contains key, contains value, key lookup result equality (map(key) == value), = (frozen collections)
*
* NOTE that collection type support will be introduced in SPARKC-630
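*
* For illustration, given a hypothetical table with SAI indexes on `score` (int) and
* `name` (text), where the basic pushdown rules left both filters to Spark:
* {{{
* // SELECT * FROM ks.tab WHERE score > 10 AND name = 'bob'
* // -> pushDownSAITextPredicates moves `name = 'bob'` to handledByCassandra,
* //    pushDownSAINumericPredicates then moves `score > 10` as well
* }}}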
*/
private def storageAttachedIndexPredicateRules(predicates: AnalyzedPredicates, table: TableDef): Seq[PredicateRule] = {
val partitionKeyColumns = table.partitionKey.map(_.columnName).toSet
val inClauseInPartitionKey = predicates.handledByCassandra
.find(f => FilterOps.isInPredicate(f) && f.references.exists(partitionKeyColumns.contains))
if (inClauseInPartitionKey.isDefined) {
logDebug(s"SAI pushdown is not possible, the query contains IN clause " +
s"on ${inClauseInPartitionKey.get.references.mkString(",")} partition key column(s).")
Seq()
} else if (saiIndexes(table).isEmpty) {
logDebug(s"There are no SAI indexes on the given table: ${table.name}")
Seq()
} else {
Seq(
pushDownSAITextPredicates,
pushDownSAINumericPredicates
)
}
}
}