com.datastax.spark.connector.cql.Schema.scala Maven / Gradle / Ivy

Go to download
Show more of this group Show more artifacts with this name
Show all versions of spark-cassandra-connector_2.11 Show documentation
spark-cassandra-connector
There is a newer version: 3.0.0-alpha2
Show newest version
package com.datastax.spark.connector.cql

import java.io.IOException

import com.datastax.spark.connector._
import com.datastax.spark.connector.mapper.{ColumnMapper, DataFrameColumnMapper}
import org.apache.spark.Logging
import org.apache.spark.sql.DataFrame

import scala.collection.JavaConversions._
import scala.language.existentials
import scala.util.{Properties, Try}
import com.datastax.driver.core._
import com.datastax.spark.connector.types.{ColumnType, CounterType}
import com.datastax.spark.connector.util.NameTools
import com.datastax.spark.connector.util.Quote._

/** Abstract column / field definition.
  * Common to tables and user-defined types */
trait FieldDef extends Serializable {
  def ref: ColumnRef
  def columnName: String
  def columnType: ColumnType[_]
}

/** Cassandra structure that contains columnar information, e.g. a table or a user defined type.
  * This trait allows `ColumnMapper` to work on tables and user defined types.
  * Cassandra tables and user defined types are similar in a way data are extracted from them,
  * therefore a common interface to describe their metadata is handy. */
trait StructDef extends Serializable {

  /** Allows to specify concrete type of column in subclasses,
    * so that `columns` and `columnByName` members return concrete types.
    * Columns in tables may carry more information than columns in user defined types. */
  type Column <: FieldDef

  /** Human-readable name for easy identification of this structure.
    * Used in the error message when the column is not found.
    * E.g. a table name or a type name. */
  def name: String

  /** Sequence of column definitions in this data structure.
    * The order of the columns is implementation-defined. */
  def columns: IndexedSeq[Column]

  /** References to the columns */
  lazy val columnRefs: IndexedSeq[ColumnRef] =
    columns.map(_.ref)

  /** Names of the columns, in the same order as column definitions. */
  def columnNames: IndexedSeq[String] =
    columns.map(_.columnName)

  /** Types of the columns, in the same order as column names and column definitions. */
  def columnTypes: IndexedSeq[ColumnType[_]] =
    columns.map(_.columnType)

  /** For quickly finding a column definition by name.
    * If column is not found, throws NoSuchElementException with information
    * about the name of the column and name of the structure. */
  def columnByName: Map[String, Column] =
    columns.map(c => (c.columnName, c)).toMap.withDefault {
      columnName => throw new NoSuchElementException(s"Column $columnName not found in $name")
    }

  /** For quickly finding a column definition by index.
    * If column is not found, throws NoSuchElementException with information
    * about the requested index of the column and name of the structure. */
  def columnByIndex(index: Int): Column = {
    require(index >= 0 && index < columns.length, s"Column index $index out of bounds for $name")
    columns(index)
  }

  /** Returns the columns that are not present in the structure. */
  def missingColumns(columnsToCheck: Seq[ColumnRef]): Seq[ColumnRef] =
    for (c <- columnsToCheck if !columnByName.contains(c.columnName)) yield c
  
  /** Type of the data described by this struct */
  type ValueRepr <: AnyRef
  
  /** Creates new instance of this struct.
    * Column values must be given in the same order as columnNames */
  def newInstance(columnValues: Any*): ValueRepr
}


sealed trait ColumnRole
case object PartitionKeyColumn extends ColumnRole
case class ClusteringColumn(index: Int) extends ColumnRole
case object StaticColumn extends ColumnRole
case object RegularColumn extends ColumnRole

/** A Cassandra column metadata that can be serialized.  */
case class ColumnDef(
  columnName: String,
  columnRole: ColumnRole,
  columnType: ColumnType[_]) extends FieldDef {

  def ref: ColumnRef = ColumnName(columnName)
  def isStatic = columnRole == StaticColumn
  def isCollection = columnType.isCollection
  def isPartitionKeyColumn = columnRole == PartitionKeyColumn
  def isClusteringColumn = columnRole.isInstanceOf[ClusteringColumn]
  def isPrimaryKeyColumn = isClusteringColumn || isPartitionKeyColumn
  def isCounterColumn = columnType == CounterType

  def componentIndex = columnRole match {
    case ClusteringColumn(i) => Some(i)
    case _ => None
  }

  def cql = {
    s"${quote(columnName)} ${columnType.cqlTypeName}"
  }
}

/** Cassandra Index Metadata that can be serialized **/
case class IndexDef (
  className: String,
  target: String,
  indexName: String,
  options: Map[String, String]) extends Serializable

object ColumnDef {

  def apply(
    column: ColumnMetadata,
    columnRole: ColumnRole): ColumnDef = {

    val columnType = ColumnType.fromDriverType(column.getType)
    ColumnDef(column.getName, columnRole, columnType)
  }
}

/** A Cassandra table metadata that can be serialized. */
case class TableDef(
    keyspaceName: String,
    tableName: String,
    partitionKey: Seq[ColumnDef],
    clusteringColumns: Seq[ColumnDef],
    regularColumns: Seq[ColumnDef],
    indexes: Seq[IndexDef] = Seq.empty,
    isView: Boolean = false) extends StructDef {

  require(partitionKey.forall(_.isPartitionKeyColumn), "All partition key columns must have role PartitionKeyColumn")
  require(clusteringColumns.forall(_.isClusteringColumn), "All clustering columns must have role ClusteringColumn")
  require(regularColumns.forall(!_.isPrimaryKeyColumn), "Regular columns cannot have role PrimaryKeyColumn")

  val allColumns = regularColumns ++ clusteringColumns ++ partitionKey

  private val indexesForTarget: Map[String, Seq[IndexDef]] = indexes.groupBy(_.target)

  /**
    * Contains indices that can be directly mapped to single column, namely indices with a handled column
    * name as a target. Indices that can not be mapped to a single column are dropped.
    */
  private val indexesForColumnDef: Map[ColumnDef, Seq[IndexDef]] = {
    indexesForTarget.flatMap {
      case (target, indexes) => Try(columnByName(target) -> indexes).toOption
    }
  }

  def isIndexed(column: String): Boolean = {
    indexesForTarget.contains(column)
  }

  def isIndexed(column: ColumnDef): Boolean = {
    indexesForColumnDef.contains(column)
  }

  val indexedColumns: Seq[ColumnDef] = {
    indexesForColumnDef.keys.toSeq
  }

  override type Column = ColumnDef

  override def name: String = s"$keyspaceName.$tableName"
  
  lazy val primaryKey: IndexedSeq[ColumnDef] =
    (partitionKey ++ clusteringColumns).toIndexedSeq

  override lazy val columns: IndexedSeq[ColumnDef] =
    (primaryKey ++ regularColumns).toIndexedSeq

  override lazy val columnByName: Map[String, ColumnDef] =
    super.columnByName

  def cql = {
    val columnList = columns.map(_.cql).mkString(s",${Properties.lineSeparator}  ")
    val partitionKeyClause = partitionKey.map(_.columnName).map(quote).mkString("(", ", ", ")")
    val clusteringColumnNames = clusteringColumns.map(_.columnName).map(quote)
    val primaryKeyClause = (partitionKeyClause +: clusteringColumnNames).mkString(", ")

    s"""CREATE TABLE ${quote(keyspaceName)}.${quote(tableName)} (
       |  $columnList,
       |  PRIMARY KEY ($primaryKeyClause)
       |)""".stripMargin
  }

  type ValueRepr = CassandraRow
  
  def newInstance(columnValues: Any*): CassandraRow = {
    new CassandraRow(CassandraRowMetadata.fromColumnNames(columnNames), columnValues.toIndexedSeq.map(_.asInstanceOf[AnyRef]))
  }
}

object TableDef {

  /** Constructs a table definition based on the mapping provided by
    * appropriate [[com.datastax.spark.connector.mapper.ColumnMapper]] for the given type. */
  def fromType[T : ColumnMapper](keyspaceName: String, tableName: String): TableDef =
    implicitly[ColumnMapper[T]].newTable(keyspaceName, tableName)

  def fromDataFrame(dataFrame: DataFrame, keyspaceName: String, tableName: String): TableDef =
    new DataFrameColumnMapper(dataFrame.schema).newTable(keyspaceName, tableName)
}

/** A Cassandra keyspace metadata that can be serialized. */
case class KeyspaceDef(keyspaceName: String, tables: Set[TableDef]) {
  lazy val tableByName = tables.map(t => (t.tableName, t)).toMap
}

case class Schema(clusterName: String, keyspaces: Set[KeyspaceDef]) {

  /** Returns a map from keyspace name to keyspace metadata */
  lazy val keyspaceByName: Map[String, KeyspaceDef] =
    keyspaces.map(k => (k.keyspaceName, k)).toMap

  /** All tables from all keyspaces */
  lazy val tables: Set[TableDef] =
    for (keyspace <- keyspaces; table <- keyspace.tables) yield table

}

object Schema extends Logging {

   private def fetchPartitionKey(table: AbstractTableMetadata): Seq[ColumnDef] = {
    for (column <- table.getPartitionKey) yield { ColumnDef(column, PartitionKeyColumn) }
  }

  private def fetchClusteringColumns(table: AbstractTableMetadata): Seq[ColumnDef] = {
    for ((column, index) <- table.getClusteringColumns.zipWithIndex) yield {
      ColumnDef(column, ClusteringColumn(index))
    }
  }

  private def fetchRegularColumns(table: AbstractTableMetadata): Seq[ColumnDef] = {
    val primaryKey = table.getPrimaryKey.toSet
    val regularColumns = table.getColumns.filterNot(primaryKey.contains)
    for (column <- regularColumns) yield {
      if (column.isStatic)
        ColumnDef(column, StaticColumn)
      else
        ColumnDef(column, RegularColumn)
    }
  }

  /** Fetches database schema from Cassandra. Provides access to keyspace, table and column metadata.
    *
    * @param keyspaceName if defined, fetches only metadata of the given keyspace
    * @param tableName if defined, fetches only metadata of the given table
    */
  def fromCassandra(
    connector: CassandraConnector,
    keyspaceName: Option[String] = None,
    tableName: Option[String] = None): Schema = {

    def isKeyspaceSelected(keyspace: KeyspaceMetadata): Boolean =
      keyspaceName match {
        case None => true
        case Some(name) => keyspace.getName == name
      }

    def isTableSelected(table: AbstractTableMetadata): Boolean =
      tableName match {
        case None => true
        case Some(name) => table.getName == name
      }

    def fetchTables(keyspace: KeyspaceMetadata): Set[TableDef] =
      for (table <- (keyspace.getTables.toSet ++ keyspace.getMaterializedViews.toSet)
        if isTableSelected(table)) yield {
          val partitionKey = fetchPartitionKey(table)
          val clusteringColumns = fetchClusteringColumns(table)
          val regularColumns = fetchRegularColumns(table)
          val indexDefs = getIndexDefs(table)

          val isView = table match {
            case x: MaterializedViewMetadata => true
            case _ => false
          }

          TableDef(
            keyspace.getName,
            table.getName,
            partitionKey,
            clusteringColumns,
            regularColumns,
            indexDefs,
            isView)
        }

    def fetchKeyspaces(metadata: Metadata): Set[KeyspaceDef] =
      for (keyspace <- metadata.getKeyspaces.toSet if isKeyspaceSelected(keyspace)) yield
        KeyspaceDef(keyspace.getName, fetchTables(keyspace))

    /**
      * See [[com.datastax.driver.core.Metadata#handleId]]
      */
    def handleId(table: TableMetadata, columnName: String): String =
      Option(table.getColumn(columnName)).map(_.getName).getOrElse(columnName)

    def getIndexDefs(tableOrView: AbstractTableMetadata): Seq[IndexDef] = tableOrView match {
      case table: TableMetadata =>
        table.getIndexes.map(index => {
          val target = handleId(table, index.getTarget)
          IndexDef(index.getIndexClassName, target, index.getName, Map.empty)
        }).toSeq
      case view: MaterializedViewMetadata => Seq.empty
    }

    connector.withClusterDo { cluster =>
      val clusterName = cluster.getMetadata.getClusterName
      logDebug(s"Retrieving database schema from cluster $clusterName...")
      val keyspaces = fetchKeyspaces(cluster.getMetadata)
      logDebug(s"${keyspaces.size} keyspaces fetched from cluster $clusterName: " +
        s"${keyspaces.map(_.keyspaceName).mkString("{", ",", "}")}")
      Schema(clusterName, keyspaces)
    }
  }


  /**
    * Fetches a TableDef for a particular Cassandra Table throws an
    * exception with name options if the table is not found.
    */
  def tableFromCassandra(
    connector: CassandraConnector,
    keyspaceName: String,
    tableName: String): TableDef = {

    fromCassandra(connector, Some(keyspaceName), Some(tableName)).tables.headOption match {
      case Some(t) => t
      case None =>
        val metadata: Metadata = connector.withClusterDo(_.getMetadata)
        val suggestions = NameTools.getSuggestions(metadata, keyspaceName, tableName)
        val errorMessage = NameTools.getErrorString(keyspaceName, tableName, suggestions)
        throw new IOException(errorMessage)
      }
    }
}