package com.exasol.spark
import org.apache.spark.sql.DataFrame
import org.apache.spark.sql.SaveMode
import org.apache.spark.sql.SQLContext
import org.apache.spark.sql.sources._
import org.apache.spark.sql.types.StructType
import com.exasol.spark.util.ExasolConfiguration
import com.exasol.spark.util.ExasolConnectionManager
import com.exasol.spark.util.Types
import com.exasol.spark.writer.ExasolWriter
/**
 * The default data source for integrating Exasol with Spark.
 *
 * It also serves as a factory class to create [[ExasolRelation]] instances
 * for Spark applications.
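 *
 * A minimal read sketch. The `query` option is required by this source; the
 * connection option names (`host`, `port`, `username`, `password`) follow
 * the connector documentation and the values are illustrative:
 *
 * {{{
 *   val df = sqlContext.read
 *     .format("exasol")
 *     .option("host", "10.0.0.11")
 *     .option("port", "8563")
 *     .option("username", "sys")
 *     .option("password", "exasol")
 *     .option("query", "SELECT * FROM MY_SCHEMA.MY_TABLE")
 *     .load()
 * }}}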
*/
class DefaultSource
extends RelationProvider
with DataSourceRegister
with SchemaRelationProvider
with CreatableRelationProvider {
override def shortName(): String = "exasol"
/**
* Creates an [[ExasolRelation]] using provided Spark
* [[org.apache.spark.sql.SQLContext]] and parameters.
*
   * Since the '''schema''' is not provided, it is inferred by running an
   * Exasol query with a `LIMIT 1` clause.
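   *
   * For example, for `option("query", "SELECT * FROM T")` the schema may be
   * inferred with a wrapped query similar to the one below (the exact form
   * is an internal detail of [[ExasolRelation]]):
   *
   * {{{
   *   SELECT * FROM (SELECT * FROM T) A LIMIT 1
   * }}}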
*
* @param sqlContext A Spark [[org.apache.spark.sql.SQLContext]] context
   * @param parameters The parameters provided as options; the `query`
   *        parameter is required for reads
* @return An [[ExasolRelation]] relation
*/
override def createRelation(
sqlContext: SQLContext,
parameters: Map[String, String]
): BaseRelation = {
val queryString = getKeyValue("query", parameters)
val manager = createManager(parameters, sqlContext)
new ExasolRelation(sqlContext, queryString, None, manager)
}
/**
* Creates an [[ExasolRelation]] using the provided Spark
* [[org.apache.spark.sql.SQLContext]], parameters and schema.
*
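   * A sketch with a user-provided schema (the column names are illustrative
   * and the connection options are omitted for brevity):
   *
   * {{{
   *   import org.apache.spark.sql.types._
   *
   *   val schema = StructType(Seq(
   *     StructField("ID", LongType),
   *     StructField("NAME", StringType)
   *   ))
   *
   *   val df = sqlContext.read
   *     .format("exasol")
   *     .schema(schema)
   *     .option("query", "SELECT ID, NAME FROM MY_SCHEMA.MY_TABLE")
   *     .load()
   * }}}
   *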
* @param sqlContext A Spark [[org.apache.spark.sql.SQLContext]] context
   * @param parameters The parameters provided as options; the `query`
   *        parameter is required for reads
* @param schema A user provided schema used to select columns for the
* relation
* @return An [[ExasolRelation]] relation
*/
override def createRelation(
sqlContext: SQLContext,
parameters: Map[String, String],
schema: StructType
): BaseRelation = {
val queryString = getKeyValue("query", parameters)
val manager = createManager(parameters, sqlContext)
new ExasolRelation(sqlContext, queryString, Option(schema), manager)
}
/**
* Creates an [[ExasolRelation]] after saving a
   * [[org.apache.spark.sql.DataFrame]] into an Exasol table.
*
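   * A write sketch. The `table` option is required by this source; the
   * connection option names follow the connector documentation and the
   * values are illustrative:
   *
   * {{{
   *   df.write
   *     .mode("append")
   *     .format("exasol")
   *     .option("host", "10.0.0.11")
   *     .option("port", "8563")
   *     .option("username", "sys")
   *     .option("password", "exasol")
   *     .option("table", "MY_SCHEMA.MY_TABLE")
   *     .save()
   * }}}
   *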
* @param sqlContext A Spark [[org.apache.spark.sql.SQLContext]] context
* @param mode One of Spark save modes, [[org.apache.spark.sql.SaveMode]]
   * @param parameters The parameters provided as options; the `table`
   *        parameter is required for writes
   * @param data A Spark [[org.apache.spark.sql.DataFrame]] to save as an
   *        Exasol table
* @return An [[ExasolRelation]] relation
*/
override def createRelation(
sqlContext: SQLContext,
mode: SaveMode,
parameters: Map[String, String],
data: DataFrame
): BaseRelation = {
val tableName = getKeyValue("table", parameters)
val manager = createManager(parameters, sqlContext)
if (manager.config.drop_table) {
manager.dropTable(tableName)
}
    val tableExists = manager.tableExists(tableName)
mode match {
case SaveMode.Overwrite =>
        if (!tableExists) {
createExasolTable(data, tableName, manager)
}
manager.truncateTable(tableName)
saveDataFrame(sqlContext, data, tableName, manager)
case SaveMode.Append =>
        if (!tableExists) {
createExasolTable(data, tableName, manager)
}
saveDataFrame(sqlContext, data, tableName, manager)
case SaveMode.ErrorIfExists =>
        if (tableExists) {
throw new UnsupportedOperationException(
s"""|Table $tableName already exists. And DataFrame write mode is set to
|`errorifexists` or `default`. Please use one of other SaveMode
|modes: 'append', 'overwrite' or 'ignore'.
""".stripMargin
)
}
createExasolTable(data, tableName, manager)
saveDataFrame(sqlContext, data, tableName, manager)
case SaveMode.Ignore =>
        if (!tableExists) {
createExasolTable(data, tableName, manager)
saveDataFrame(sqlContext, data, tableName, manager)
}
}
val newParams = parameters ++ Map("query" -> s"SELECT * FROM $tableName")
createRelation(sqlContext, newParams, data.schema)
}
  // Saves a Spark dataframe into an Exasol table.
private[this] def saveDataFrame(
sqlContext: SQLContext,
df: DataFrame,
tableName: String,
manager: ExasolConnectionManager
): Unit = {
    val writer = new ExasolWriter(sqlContext.sparkContext, tableName, df.schema, manager)
    // Starts the parallel connections and returns the Exasol data node count
    val exaNodesCnt = writer.startParallel()
    // Align the number of Spark partitions with the number of Exasol nodes
    val newDF = repartitionPerNode(df, exaNodesCnt)
    newDF.rdd.foreachPartition(iter => writer.insertPartition(iter))
}
  // Creates an Exasol table that matches the Spark dataframe schema.
private[this] def createExasolTable(
df: DataFrame,
tableName: String,
manager: ExasolConnectionManager
): Unit =
if (manager.config.create_table || manager.config.drop_table) {
manager.createTable(tableName, Types.createTableSchema(df.schema))
} else {
throw new UnsupportedOperationException(
s"""
           |Table $tableName does not exist. Please enable table creation by setting
           |the 'create_table' option to 'true'.
|For example:
| df.write
| .mode("overwrite")
| .option("table", "nonexist")
| .option("create_table", "true")
| .format("exasol")
| .save()
""".stripMargin
)
}
/**
   * Rearranges the dataframe partitions to match the Exasol data node count.
*
* If `nodesCnt` < `df.rdd.getNumPartitions` then perform
*
* {{{
* df.coalesce(nodesCnt)
* }}}
*
* in order to reduce the partition counts.
*
* If `nodesCnt` > `df.rdd.getNumPartitions` then perform
*
* {{{
* df.repartition(nodesCnt)
* }}}
*
   * so that there is a partition for each data node.
*
   * If the number of partitions and nodes are the same, then do nothing.
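   *
   * For example (partition and node counts are illustrative):
   *
   * {{{
   *   // df.rdd.getNumPartitions == 10, Exasol cluster has 4 data nodes
   *   val perNodeDF = repartitionPerNode(df, 4) // equivalent to df.coalesce(4)
   * }}}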
*/
def repartitionPerNode(df: DataFrame, nodesCnt: Int): DataFrame = {
val rddPartitionCnt = df.rdd.getNumPartitions
if (nodesCnt < rddPartitionCnt) {
df.coalesce(nodesCnt)
} else if (nodesCnt > rddPartitionCnt) {
df.repartition(nodesCnt)
} else {
df
}
}
private[this] def getKeyValue(key: String, parameters: Map[String, String]): String =
parameters.get(key) match {
case Some(str) => str
case None =>
throw new UnsupportedOperationException(
s"A $key parameter should be specified in order to run the operation"
)
}
// Creates an ExasolConnectionManager with merged configuration values.
private[this] def createManager(
parameters: Map[String, String],
sqlContext: SQLContext
): ExasolConnectionManager = {
val config = ExasolConfiguration(mergeConfigurations(parameters, sqlContext.getAllConfs))
ExasolConnectionManager(config)
}
  // Merges user-provided parameters with `spark.exasol.*` runtime
  // configurations. If both define the same key, the value provided at
  // runtime takes precedence.
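  // For example (values illustrative):
  //   mergeConfigurations(Map("username" -> "sys"), Map("spark.exasol.username" -> "app"))
  //   // => Map("username" -> "app")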
private[spark] def mergeConfigurations(
parameters: Map[String, String],
sparkConf: Map[String, String]
): Map[String, String] =
parameters ++ sparkConf
      .filter { case (key, _) => key.startsWith("spark.exasol.") }
      .map { case (key, value) => key.substring("spark.exasol.".length) -> value }
}